476 files changed, 35603 insertions, 15291 deletions
diff --git a/tests/AssetsLibrary.cpp b/tests/AssetsLibrary.cpp
index 62de78cf26..571b55125b 100644
--- a/tests/AssetsLibrary.cpp
+++ b/tests/AssetsLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -524,13 +524,14 @@ inline void validate_npy_header(std::ifstream &stream, const std::string &expect
     ARM_COMPUTE_UNUSED(expect_typestr);
     ARM_COMPUTE_UNUSED(expect_shape);
 
-    std::string header = npy::read_header(stream);
+    std::string header_s = npy::read_header(stream);
 
     // Parse header
-    std::vector<unsigned long> shape;
-    bool                       fortran_order = false;
-    std::string                typestr;
-    npy::parse_header(header, typestr, fortran_order, shape);
+    npy::header_t header = npy::parse_header(header_s);
+
+    std::vector<unsigned long> shape         = header.shape;
+    bool                       fortran_order = header.fortran_order;
+    std::string                typestr       = header.dtype.str();
 
     // Check if the typestring matches the given one
     ARM_COMPUTE_ERROR_ON_MSG(typestr != expect_typestr, "Typestrings mismatch");
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index f465577372..bd97cb7bd4 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -406,6 +406,17 @@ public:
     template <typename T, typename DataType>
     void fill_static_values(T &&tensor, const std::vector<DataType> &values) const;
 
+    // Function type to generate a number to fill tensors.
+    template <typename ResultType>
+    using GeneratorFunctionType = std::function<ResultType(void)>;
+    /** Fill a tensor with a value generator function.
+     *
+     * @param[in, out] tensor         To be filled tensor.
+     * @param[in]      generate_value A function that generates values.
+     */
+    template <typename T, typename ResultType>
+    void fill_with_generator(T &&tensor, const GeneratorFunctionType<ResultType> &generate_value) const;
+
 private:
     // Function prototype to convert between image formats.
     using Converter = void (*)(const RawTensor &src, RawTensor &dst);
@@ -413,9 +424,6 @@ private:
     using Extractor = void (*)(const RawTensor &src, RawTensor &dst);
     // Function prototype to load an image file.
     using Loader = RawTensor (*)(const std::string &path);
-    // Function type to generate a number to fill tensors.
-    template <typename ResultType>
-    using GeneratorFunctionType = std::function<ResultType(void)>;
 
     const Converter &get_converter(Format src, Format dst) const;
     const Converter &get_converter(DataType src, Format dst) const;
@@ -460,14 +468,6 @@ private:
      */
     const RawTensor &find_or_create_raw_tensor(const std::string &name, Format format, Channel channel) const;
 
-    /** Fill a tensor with a value generator function.
-     *
-     * @param[in, out] tensor         To be filled tensor.
-     * @param[in]      generate_value A function that generates values.
-     */
-    template <typename T, typename ResultType>
-    void fill_with_generator(T &&tensor, const GeneratorFunctionType<ResultType> &generate_value) const;
-
     mutable TensorCache             _cache{};
     mutable arm_compute::Mutex      _format_lock{};
     mutable arm_compute::Mutex      _channel_lock{};
@@ -725,7 +725,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
         case DataType::U8:
         case DataType::QASYMM8:
         {
-            std::uniform_int_distribution<uint8_t> distribution_u8(std::numeric_limits<uint8_t>::lowest(), std::numeric_limits<uint8_t>::max());
+            std::uniform_int_distribution<unsigned int> distribution_u8(std::numeric_limits<uint8_t>::lowest(), std::numeric_limits<uint8_t>::max());
             fill(tensor, distribution_u8, seed_offset);
             break;
         }
@@ -734,7 +734,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
         case DataType::QSYMM8_PER_CHANNEL:
         case DataType::QASYMM8_SIGNED:
         {
-            std::uniform_int_distribution<int8_t> distribution_s8(std::numeric_limits<int8_t>::lowest(), std::numeric_limits<int8_t>::max());
+            std::uniform_int_distribution<int> distribution_s8(std::numeric_limits<int8_t>::lowest(), std::numeric_limits<int8_t>::max());
             fill(tensor, distribution_s8, seed_offset);
             break;
         }
@@ -826,20 +826,20 @@ void AssetsLibrary::fill_tensor_uniform_ranged(T
         case DataType::U8:
         case DataType::QASYMM8:
         {
-            const auto                         converted_pairs = detail::convert_range_pair<uint8_t>(excluded_range_pairs);
-            RangedUniformDistribution<uint8_t> distribution_u8(std::numeric_limits<uint8_t>::lowest(),
-                                                               std::numeric_limits<uint8_t>::max(),
-                                                               converted_pairs);
+            const auto                          converted_pairs = detail::convert_range_pair<uint32_t>(excluded_range_pairs);
+            RangedUniformDistribution<uint32_t> distribution_u8(std::numeric_limits<uint8_t>::lowest(),
+                                                                std::numeric_limits<uint8_t>::max(),
+                                                                converted_pairs);
             fill(tensor, distribution_u8, seed_offset);
             break;
         }
         case DataType::S8:
         case DataType::QSYMM8:
         {
-            const auto                        converted_pairs = detail::convert_range_pair<int8_t>(excluded_range_pairs);
-            RangedUniformDistribution<int8_t> distribution_s8(std::numeric_limits<int8_t>::lowest(),
-                                                              std::numeric_limits<int8_t>::max(),
-                                                              converted_pairs);
+            const auto                         converted_pairs = detail::convert_range_pair<int32_t>(excluded_range_pairs);
+            RangedUniformDistribution<int32_t> distribution_s8(std::numeric_limits<int8_t>::lowest(),
+                                                               std::numeric_limits<int8_t>::max(),
+                                                               converted_pairs);
             fill(tensor, distribution_s8, seed_offset);
             break;
         }
@@ -918,7 +918,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
         case DataType::QASYMM8:
         {
             ARM_COMPUTE_ERROR_ON(!(std::is_same<uint8_t, D>::value));
-            std::uniform_int_distribution<uint8_t> distribution_u8(low, high);
+            std::uniform_int_distribution<uint32_t> distribution_u8(low, high);
             fill(tensor, distribution_u8, seed_offset);
             break;
         }
@@ -927,7 +927,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
         case DataType::QASYMM8_SIGNED:
         {
             ARM_COMPUTE_ERROR_ON(!(std::is_same<int8_t, D>::value));
-            std::uniform_int_distribution<int8_t> distribution_s8(low, high);
+            std::uniform_int_distribution<int32_t> distribution_s8(low, high);
             fill(tensor, distribution_s8, seed_offset);
             break;
         }
diff --git a/tests/BUILD.bazel b/tests/BUILD.bazel
new file mode 100644
index 0000000000..5763938d3c
--- /dev/null
+++ b/tests/BUILD.bazel
@@ -0,0 +1,158 @@
+# Copyright (c) 2023 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+#---------------------------------------------------------------------
+# Validation Framework Library
+
+cc_library(
+    name = "validation_framework",
+    srcs = glob([
+        "validation/reference/*.cpp",
+        "validation/*.cpp",
+        "*.h",
+    ]),
+    hdrs = glob([
+        "validation/reference/*.h",
+        "validation/**/*.h",
+    ]),
+    copts = [] + select({
+                  "//:arch_armv8-a": ["-march=armv8-a"],
+                  "//:arch_armv8.2-a+fp16": ["-march=armv8.2-a+fp16"],
+                  "//conditions:default": ["-march=armv8-a"],
+              }) + select({
+                "//:debug_flag": [
+                    "-O0",
+                    "-g",
+                    "-gdwarf-2",
+                ],
+                "//conditions:default": ["-O3"],
+            }) +
+            select({
+                "//:openmp_flag": ["-fopenmp"],
+                "//conditions:default": [],
+            }) +
+            select({
+                "//:Werror_flag": ["-Werror"],
+                "//conditions:default": [],
+            }),
+    linkstatic = True,
+    deps = [
+        "//:arm_compute",
+        "//:common_defines",
+        "//tests/framework",
+    ],
+)
+
+#---------------------------------------------------------------------
+# Validation Binary
+cc_binary(
+    name = "arm_compute_validation",
+    srcs = glob([
+        "validation/UNIT/**/*.cpp",
+        "validation/CPP/**/*.cpp",
+        "NEON/*.h",
+        "validation/NEON/**/*.cpp",
+        "validation/NEON/**/*.h",
+        "*.cpp",
+        "datasets/*.h",
+        "instruments/*.h",
+    ]),
+    copts = [] + select({
+                  "//:arch_armv8-a": ["-march=armv8-a"],
+                  "//:arch_armv8.2-a+fp16": ["-march=armv8.2-a+fp16"],
+                  "//conditions:default": ["-march=armv8-a"],
+              }) + select({
+                "//:debug_flag": [
+                    "-O0",
+                    "-g",
+                    "-gdwarf-2",
+                ],
+                "//conditions:default": ["-O3"],
+            }) +
+            select({
+                "//:openmp_flag": ["-fopenmp"],
+                "//conditions:default": [],
+            }) +
+            select({
+                "//:Werror_flag": ["-Werror"],
+                "//conditions:default": [],
+            }),
+    linkstatic = True,
+    deps = [
+        ":validation_framework",
+        "//:arm_compute",
+        "//:arm_compute_graph",
+        "//:common_defines",
+        "//tests/framework",
+    ],
+    local_defines = [] +
+        select({
+                "//:bf16_validation_flag": [
+                "ARM_COMPUTE_ENABLE_BF16",
+                ],
+                "//conditions:default": [],
+              }) +
+        select({
+                "//:sve_validation_flag": [
+                "ENABLE_SVE",
+                "ARM_COMPUTE_ENABLE_SVE",
+                ],
+                "//conditions:default": [],
+              })
+)
+
+#---------------------------------------------------------------------
+# Benchmark Binary
+cc_binary(
+    name = "arm_benchmark",
+    srcs = glob([
+        "benchmark/fixtures/*.h",
+        "benchmark/NEON/*.cpp",
+        "*.cpp",
+    ]),
+    copts = [] + select({
+                  "//:arch_armv8-a": ["-march=armv8-a"],
+                  "//:arch_armv8.2-a+fp16": ["-march=armv8.2-a+fp16"],
+                  "//conditions:default": ["-march=armv8-a"],
+              }) + select({
+                "//:debug_flag": [
+                    "-O0",
+                    "-g",
+                    "-gdwarf-2",
+                ],
+                "//conditions:default": ["-O3"],
+            }) +
+            select({
+                "//:openmp_flag": ["-fopenmp"],
+                "//conditions:default": [],
+            }) +
+            select({
+                "//:Werror_flag": ["-Werror"],
+                "//conditions:default": [],
+            }),
+    linkstatic = True,
+    deps = [
+        ":arm_compute_validation",
+        ":validation_framework",
+        "//:arm_compute",
+    ],
+)
diff --git a/tests/CL/CLHOGAccessor.h b/tests/CL/CLHOGAccessor.h
deleted file mode 100644
index 2b594955f6..0000000000
--- a/tests/CL/CLHOGAccessor.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CLHOGACCESSOR_H
-#define ARM_COMPUTE_TEST_CLHOGACCESSOR_H
-
-#include "arm_compute/runtime/CL/CLHOG.h"
-#include "tests/IHOGAccessor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-/** Accessor implementation for @ref CLHOG objects. */
-class CLHOGAccessor : public IHOGAccessor
-{
-public:
-    /** Create an accessor for the given @p CLHOG. */
-    CLHOGAccessor(CLHOG &hog)
-        : _hog{ hog }
-    {
-        _hog.map();
-    }
-
-    /** Destructor that unmaps the CL memory. */
-    ~CLHOGAccessor()
-    {
-        _hog.unmap();
-    }
-
-    /** Prevent instances of this class from being copied (As this class contains references). */
-    CLHOGAccessor(const CLHOGAccessor &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains references). */
-    CLHOGAccessor &operator=(const CLHOGAccessor &) = delete;
-
-    /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     *
-     * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     */
-    float *descriptor() const override
-    {
-        return _hog.descriptor();
-    }
-
-private:
-    CLHOG &_hog;
-};
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CLHOGACCESSOR_H */
diff --git a/tests/CL/CLLutAccessor.h b/tests/CL/CLLutAccessor.h
deleted file mode 100644
index 78cd85d405..0000000000
--- a/tests/CL/CLLutAccessor.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CL_CLLUTACCESSOR_H
-#define ARM_COMPUTE_TEST_CL_CLLUTACCESSOR_H
-
-#include "tests/ILutAccessor.h"
-
-#include "arm_compute/runtime/CL/CLLut.h"
-
-namespace arm_compute
-{
-namespace test
-{
-/** Accessor implementation for @ref CLLut objects. */
-template <typename T>
-class CLLutAccessor : public ILutAccessor<T>
-{
-public:
-    /** Create an accessor for the given @p CLLut.
-     */
-    CLLutAccessor(CLLut &lut)
-        : _lut{ lut }
-    {
-        _lut.map(true);
-    }
-    /** Default destructor */
-    ~CLLutAccessor()
-    {
-        _lut.unmap();
-    }
-
-    /** Prevent instances of this class from being copy constructed */
-    CLLutAccessor(const CLLutAccessor &) = delete;
-    /** Prevent instances of this class from being copied */
-    CLLutAccessor &operator=(const CLLutAccessor &) = delete;
-    /** Allow instances of this class to be move constructed */
-    CLLutAccessor(CLLutAccessor &&) = default;
-    /** Allow instance of this class to be moved */
-    CLLutAccessor &operator=(CLLutAccessor &&) = default;
-
-    int num_elements() const override
-    {
-        return _lut.num_elements();
-    }
-
-    const T &operator[](T input_value) const override
-    {
-        auto    lut        = reinterpret_cast<T *>(_lut.buffer());
-        int32_t real_index = _lut.index_offset() + static_cast<int32_t>(input_value);
-
-        if(0 <= real_index && real_index < num_elements())
-        {
-            return lut[real_index];
-        }
-        ARM_COMPUTE_ERROR("Error index not in range.");
-    }
-
-    T &operator[](T input_value) override
-    {
-        auto    lut        = reinterpret_cast<T *>(_lut.buffer());
-        int32_t real_index = _lut.index_offset() + static_cast<int32_t>(input_value);
-
-        if(0 <= real_index && real_index < num_elements())
-        {
-            return lut[real_index];
-        }
-        ARM_COMPUTE_ERROR("Error index not in range.");
-    }
-
-private:
-    CLLut &_lut;
-};
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CL_CLLUTACCESSOR_H */
diff --git a/tests/CL/Helper.h b/tests/CL/Helper.h
index 5153e98add..dd5e8647b0 100644
--- a/tests/CL/Helper.h
+++ b/tests/CL/Helper.h
@@ -29,8 +29,11 @@
 #include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClFill.h"
 
 #include "src/core/CL/ICLKernel.h"
+#include "support/Cast.h"
 
 #include <memory>
 
@@ -38,6 +41,86 @@ namespace arm_compute
 {
 namespace test
 {
+/** This template synthetizes a simple opencl::IClOperator which runs the given kernel K (the kernel is created and configured by configure()) */
+template <typename K>
+class CLSynthetizeOperator : public opencl::IClOperator
+{
+public:
+    /** Create a kernel of type K and configure it with the compile context of CLKernelLibrary.
+     *
+     * @param[in] args Configuration arguments, forwarded to K::configure() after the compile context.
+     */
+    template <typename... Args>
+    void configure(Args &&... args)
+    {
+        auto k = std::make_unique<K>();
+        k->configure(CLKernelLibrary::get().get_compile_context(), std::forward<Args>(args)...);
+        _kernel = std::move(k);
+    }
+    /** Create and configure the kernel, setting the GPU target on it before it is configured.
+     *
+     * @param[in] gpu_target GPUTarget to set on the kernel.
+     * @param[in] args       Configuration arguments, forwarded to K::configure() after the compile context.
+     */
+    template <typename... Args>
+    void configure(GPUTarget gpu_target, Args &&... args)
+    {
+        auto k = std::make_unique<K>();
+        k->set_target(gpu_target);
+        k->configure(CLKernelLibrary::get().get_compile_context(), std::forward<Args>(args)...);
+        _kernel = std::move(k);
+    }
+    /** Validate the arguments by forwarding them to K::validate() and returning its Status.
+     *
+     * @param[in] args Arguments to validate, forwarded unchanged to K::validate().
+     */
+    template <typename... Args>
+    static Status validate(Args &&... args)
+    {
+        return K::validate(std::forward<Args>(args)...);
+    }
+};
+
+/** As above but this also fills the destination tensor with zeros and sets a zero constant border of size bordersize on the first argument */
+template <typename K, int bordersize>
+class CLSynthetizeOperatorInitOutputWithZeroAndWithZeroConstantBorder : public opencl::IClOperator
+{
+public:
+    /** Configure the kernel, the border handler and the fill operator.
+     *
+     * @param[in] first  First input argument; also passed to the border handler.
+     * @param[in] second Second input argument; also passed to the fill operator.
+     * @param[in] args   Rest of the configuration arguments, forwarded to K::configure().
+     */
+    template <typename T, typename... Args>
+    void configure(T first, T second, Args &&... args)
+    {
+        auto cctx = CLKernelLibrary::get().get_compile_context();
+        auto k    = std::make_unique<K>();
+        k->set_target(CLScheduler::get().target());
+        k->configure(cctx, first, second, std::forward<Args>(args)...);
+        _kernel = std::move(k);
+        _border_handler.configure(cctx, first, BorderSize(bordersize), BorderMode::CONSTANT, PixelValue());
+        _fill.configure(cctx, second, PixelValue());
+    }
+
+    // Inherited method overridden: fills the ACL_DST tensor, then enqueues the border handler followed by the kernel.
+    void run(ITensorPack &tensors) override final
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The CL kernel or function isn't configured");
+
+        ITensorPack fill_pack = { { ACL_SRC, tensors.get_tensor(TensorType::ACL_DST) } };
+        _fill.run(fill_pack);
+        CLScheduler::get().enqueue_op(_border_handler, tensors);
+        CLScheduler::get().enqueue_op(*_kernel, tensors);
+    }
+
+private:
+    opencl::ClFill             _fill{};           /**< Operator to initialize the destination tensor */
+    CLFillBorderKernel         _border_handler{}; /**< Kernel to handle borders */
+    std::unique_ptr<ICLKernel> _kernel{};         /**< Kernel to run */
+};
+
 /** This template synthetizes an ICLSimpleFunction which runs the given kernel K */
 template <typename K>
 class CLSynthetizeFunction : public ICLSimpleFunction
@@ -135,6 +218,39 @@ private:
     CLFillBorderKernel         _border_handler{}; /**< Kernel to handle  borders */
     std::unique_ptr<ICLKernel> _kernel{};         /**< Kernel to run */
 };
+
+/** As above but this also sets up a zero constant border, sized by the kernel's border_size(), on the input tensor (the first configuration argument) */
+template <typename K>
+class ClSynthetizeOperatorWithBorder : public opencl::IClOperator
+{
+public:
+    /** Configure the kernel and its border handler.
+     *
+     * @param[in] first First configuration argument; also passed to the border handler.
+     * @param[in] args  Rest of the configuration arguments, forwarded to K::configure().
+     */
+    template <typename T, typename... Args>
+    void configure(T first, Args &&... args)
+    {
+        auto k = std::make_unique<K>();
+        k->configure(CLKernelLibrary::get().get_compile_context(), first, std::forward<Args>(args)...);
+        _kernel = std::move(k);
+
+        auto b = std::make_unique<CLFillBorderKernel>();
+        b->configure(CLKernelLibrary::get().get_compile_context(), first, BorderSize(_kernel->border_size()), BorderMode::CONSTANT, PixelValue());
+        _border_handler = std::move(b);
+    }
+
+    void run(ITensorPack &tensors) override
+    {
+        CLScheduler::get().enqueue(*_border_handler);
+        CLScheduler::get().enqueue_op(*_kernel, tensors);
+    }
+
+private:
+    std::unique_ptr<ICLKernel> _border_handler{ nullptr }; /**< Kernel to handle borders */
+    std::unique_ptr<ICLKernel> _kernel{};                  /**< Kernel to run */
+};
 } // namespace test
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_TEST_CL_HELPER_H */
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000000..20a010f38c
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,131 @@
+# Copyright (c) 2023-2024 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+target_sources(
+  arm_compute_validation_framework
+  PRIVATE validation/Validation.cpp
+          validation/Helpers.cpp
+          validation/reference/BoundingBoxTransform.cpp
+          validation/reference/GEMMReshapeRHSMatrix.cpp
+          validation/reference/ChannelShuffle.cpp
+          validation/reference/Logical.cpp
+          validation/reference/PoolingLayer.cpp
+          validation/reference/BitwiseNot.cpp
+          validation/reference/Conv3D.cpp
+          validation/reference/GEMMReshapeLHSMatrix.cpp
+          validation/reference/ComputeAllAnchors.cpp
+          validation/reference/DepthConcatenateLayer.cpp
+          validation/reference/TableLookup.cpp
+          validation/reference/ROIPoolingLayer.cpp
+          validation/reference/SliceOperations.cpp
+          validation/reference/GEMMLowp.cpp
+          validation/reference/Unstack.cpp
+          validation/reference/Pooling3dLayer.cpp
+          validation/reference/BitwiseOr.cpp
+          validation/reference/ReshapeLayer.cpp
+          validation/reference/SoftmaxLayer.cpp
+          validation/reference/Gather.cpp
+          validation/reference/Utils.cpp
+          validation/reference/Accumulate.cpp
+          validation/reference/CropResize.cpp
+          validation/reference/ReductionOperation.cpp
+          validation/reference/ConcatenateLayer.cpp
+          validation/reference/PixelWiseMultiplication.cpp
+          validation/reference/DepthConvertLayer.cpp
+          validation/reference/Erode.cpp
+          validation/reference/DepthToSpaceLayer.cpp
+          validation/reference/PadLayer.cpp
+          validation/reference/MeanStdDevNormalizationLayer.cpp
+          validation/reference/BitwiseXor.cpp
+          validation/reference/GEMM.cpp
+          validation/reference/NormalizePlanarYUVLayer.cpp
+          validation/reference/FuseBatchNormalization.cpp
+          validation/reference/BitwiseAnd.cpp
+          validation/reference/SpaceToDepth.cpp
+          validation/reference/NonMaximaSuppression.cpp
+          validation/reference/Reverse.cpp
+          validation/reference/DFT.cpp
+          validation/reference/L2NormalizeLayer.cpp
+          validation/reference/ActivationLayer.cpp
+          validation/reference/SpaceToBatch.cpp
+          validation/reference/Im2Col.cpp
+          validation/reference/DequantizationLayer.cpp
+          validation/reference/DeconvolutionLayer.cpp
+          validation/reference/MinMaxLocation.cpp
+          validation/reference/Select.cpp
+          validation/reference/BatchNormalizationLayer.cpp
+          validation/reference/InstanceNormalizationLayer.cpp
+          validation/reference/ROIAlignLayer.cpp
+          validation/reference/ElementwiseUnary.cpp
+          validation/reference/MeanStdDev.cpp
+          validation/reference/QLSTMLayerNormalization.cpp
+          validation/reference/Col2Im.cpp
+          validation/reference/FlattenLayer.cpp
+          validation/reference/AbsoluteDifference.cpp
+          validation/reference/Transpose.cpp
+          validation/reference/StackLayer.cpp
+          validation/reference/NormalizationLayer.cpp
+          validation/reference/Copy.cpp
+          validation/reference/MaxUnpoolingLayer.cpp
+          validation/reference/Winograd.cpp
+          validation/reference/Permute.cpp
+          validation/reference/Comparisons.cpp
+          validation/reference/Tile.cpp
+          validation/reference/BatchToSpaceLayer.cpp
+          validation/reference/ElementwiseOperations.cpp
+          validation/reference/QuantizationLayer.cpp
+          validation/reference/NonMaxSuppression.cpp
+          validation/reference/WeightsReshape.cpp
+          validation/reference/ArithmeticOperations.cpp
+          validation/reference/ConvertFullyConnectedWeights.cpp
+          validation/reference/Floor.cpp
+          validation/reference/PriorBoxLayer.cpp
+          validation/reference/Scale.cpp
+          validation/reference/ScatterLayer.cpp
+          validation/reference/ReorgLayer.cpp
+          validation/reference/Range.cpp
+          validation/reference/ArithmeticDivision.cpp
+          validation/reference/DepthwiseConvolutionLayer.cpp
+          validation/reference/FullyConnectedLayer.cpp
+          validation/reference/ConvolutionLayer.cpp
+          validation/reference/Reorder.cpp
+          framework/Framework.cpp
+          framework/Utils.cpp
+          framework/Exceptions.cpp
+          framework/DatasetModes.cpp
+          framework/TestFilter.cpp
+          framework/Profiler.cpp
+          framework/ParametersLibrary.cpp
+          framework/command_line/CommonOptions.cpp
+          framework/instruments/WallClockTimer.cpp
+          framework/instruments/InstrumentsStats.cpp
+          framework/instruments/Instruments.cpp
+          framework/instruments/SchedulerTimer.cpp
+          framework/instruments/hwc_names.hpp
+          framework/instruments/hwc.hpp
+          framework/printers/PrettyPrinter.cpp
+          framework/printers/Printer.cpp
+          framework/printers/JSONPrinter.cpp
+          framework/printers/Printers.cpp
+          AssetsLibrary.cpp
+          RawTensor.cpp
+          main.cpp)
diff --git a/tests/IAccessor.h b/tests/IAccessor.h
index c54c00e99e..75faee19ce 100644
--- a/tests/IAccessor.h
+++ b/tests/IAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/QuantizationInfo.h"
 
 namespace arm_compute
 {
diff --git a/tests/IHOGAccessor.h b/tests/IHOGAccessor.h
deleted file mode 100644
index f1c137c2ed..0000000000
--- a/tests/IHOGAccessor.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_IHOGACCESSOR_H
-#define ARM_COMPUTE_TEST_IHOGACCESSOR_H
-
-namespace arm_compute
-{
-namespace test
-{
-/** Common interface to access HOG structure */
-class IHOGAccessor
-{
-public:
-    /** Virtual destructor. */
-    virtual ~IHOGAccessor() = default;
-
-    /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     *
-     * @note Other elements of the array can be accessed using descriptor()[idx] for idx=[0, descriptor_size() - 1]
-     *
-     * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     */
-    virtual float *descriptor() const = 0;
-};
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_IHOGACCESSOR_H */
diff --git a/tests/NEON/HOGAccessor.h b/tests/NEON/HOGAccessor.h
deleted file mode 100644
index 735abb08d3..0000000000
--- a/tests/NEON/HOGAccessor.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HOGACCESSOR_H
-#define ARM_COMPUTE_TEST_HOGACCESSOR_H
-
-#include "arm_compute/runtime/HOG.h"
-#include "tests/IHOGAccessor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-/** Accessor implementation for @ref HOG objects. */
-class HOGAccessor : public IHOGAccessor
-{
-public:
-    /** Create an accessor for the given @p HOG.
-     */
-    HOGAccessor(HOG &hog)
-        : _hog{ hog }
-    {
-    }
-
-    /** Prevent instances of this class from being copied (As this class contains references). */
-    HOGAccessor(const HOGAccessor &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains references). */
-    HOGAccessor &operator=(const HOGAccessor &) = delete;
-
-    /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     *
-     * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     */
-    float *descriptor() const override
-    {
-        return _hog.descriptor();
-    }
-
-private:
-    HOG &_hog;
-};
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOGACCESSOR_H */
diff --git a/tests/NEON/Helper.h b/tests/NEON/Helper.h
index 714152ebcd..fb0231b62a 100644
--- a/tests/NEON/Helper.h
+++ b/tests/NEON/Helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,9 @@
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/cpu/ICpuOperator.h"
 #include "tests/Globals.h"
 
 #include <algorithm>
@@ -104,7 +106,7 @@ public:
 
 /** As above but this also setups a Zero border on the input tensor of the kernel's bordersize */
 template <typename K>
-class NESynthetizeFunctionWithZeroConstantKernelBorder : public INESimpleFunction
+class NESynthetizeFunctionWithZeroConstantKernelBorder : public cpu::ICpuOperator
 {
 public:
     /** Configure the kernel.
@@ -123,6 +125,15 @@ public:
         b->configure(first, BorderSize(_kernel->border_size()), BorderMode::CONSTANT, PixelValue());
         _border_handler = std::move(b);
     }
+
+    void run(ITensorPack &tensors)
+    {
+        NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+    }
+
+private:
+    std::unique_ptr<INEKernel> _border_handler{ nullptr };
 };
 
 } // namespace test
diff --git a/tests/NEON/LutAccessor.h b/tests/NEON/LutAccessor.h
deleted file mode 100644
index 5204d0640c..0000000000
--- a/tests/NEON/LutAccessor.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_NEON_LUTACCESSOR_H
-#define ARM_COMPUTE_TEST_NEON_LUTACCESSOR_H
-
-#include "tests/ILutAccessor.h"
-
-#include "arm_compute/runtime/Lut.h"
-
-namespace arm_compute
-{
-namespace test
-{
-/** Accessor implementation for @ref Lut objects. */
-template <typename T>
-class LutAccessor : public ILutAccessor<T>
-{
-public:
-    /** Create an accessor for the given @p Lut.
-     */
-    LutAccessor(Lut &lut)
-        : _lut{ lut }
-    {
-    }
-
-    /** Prevent instances of this class from being copy constructed */
-    LutAccessor(const LutAccessor &) = delete;
-    /** Prevent instances of this class from being copied */
-    LutAccessor &operator=(const LutAccessor &) = delete;
-    /** Allow instances of this class to be move constructed */
-    LutAccessor(LutAccessor &&) = default;
-    /** Allow instances of this class to be moved */
-    LutAccessor &operator=(LutAccessor &&) = default;
-
-    int num_elements() const override
-    {
-        return _lut.num_elements();
-    }
-
-    const T &operator[](T input_value) const override
-    {
-        auto    lut        = reinterpret_cast<T *>(_lut.buffer());
-        int32_t real_index = _lut.index_offset() + static_cast<int32_t>(input_value);
-
-        if(0 <= real_index && real_index < num_elements())
-        {
-            return lut[real_index];
-        }
-        ARM_COMPUTE_ERROR("Error index not in range.");
-    }
-
-    T &operator[](T input_value) override
-    {
-        auto    lut        = reinterpret_cast<T *>(_lut.buffer());
-        int32_t real_index = _lut.index_offset() + static_cast<int32_t>(input_value);
-
-        if(0 <= real_index && real_index < num_elements())
-        {
-            return lut[real_index];
-        }
-        ARM_COMPUTE_ERROR("Error index not in range.");
-    }
-
-private:
-    ILut &_lut;
-};
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_NEON_LUTACCESSOR_H */
diff --git a/tests/RawLutAccessor.h b/tests/RawLutAccessor.h
deleted file mode 100644
index 4318fb2dcc..0000000000
--- a/tests/RawLutAccessor.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_RAWLUTACCESSOR_H
-#define ARM_COMPUTE_TEST_RAWLUTACCESSOR_H
-
-#include "ILutAccessor.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace test
-{
-/** Accessor implementation for std::map-lut objects. */
-template <typename T>
-class RawLutAccessor : public ILutAccessor<T>
-{
-public:
-    /** Create an accessor for the given @p std::map.
-     */
-    RawLutAccessor(std::map<T, T> &lut)
-        : _lut{ lut }
-    {
-    }
-
-    /** Prevent instances of this class from being copy constructed */
-    RawLutAccessor(const RawLutAccessor &) = delete;
-    /** Prevent instances of this class from being copied */
-    RawLutAccessor &operator=(const RawLutAccessor &) = delete;
-    /** Allow instances of this class to be move constructed */
-    RawLutAccessor(RawLutAccessor &&) = default;
-    /** Allow instances of this class to be moved */
-    RawLutAccessor &operator=(RawLutAccessor &&) = default;
-
-    int num_elements() const override
-    {
-        return _lut.size();
-    }
-
-    const T &operator[](T input_value) const override
-    {
-        return _lut[input_value];
-    }
-
-    T &operator[](T input_value) override
-    {
-        return _lut[input_value];
-    }
-
-private:
-    std::map<T, T> &_lut;
-};
-
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_RAWLUTACCESSOR_H */
diff --git a/tests/SConscript b/tests/SConscript
index fea68e0fe9..0907c5713b 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -1,4 +1,7 @@
-# Copyright (c) 2017-2019 Arm Limited.
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2017-2023,2024 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -19,7 +22,6 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-import SCons
 import os.path
 
 Import('env')
@@ -28,12 +30,11 @@ Import('install_bin')
 
 # vars is imported from arm_compute:
 variables = [
-    BoolVariable("benchmark_examples", "Build benchmark examples programs", True),
-    BoolVariable("validate_examples", "Build validate examples programs", True),
+    BoolVariable("benchmark_examples", "Build benchmark examples programs", False),
+    BoolVariable("validate_examples", "Build validate examples programs", False),
     BoolVariable("reference_openmp", "Build reference validation with openmp", True),
-    #FIXME Switch the following two options to False before releasing
-    BoolVariable("validation_tests", "Build validation test programs", True),
-    BoolVariable("benchmark_tests", "Build benchmark test programs", True),
+    BoolVariable("validation_tests", "Build validation test programs", False),
+    BoolVariable("benchmark_tests", "Build benchmark test programs", False),
     ("test_filter", "Pattern to specify the tests' filenames to be compiled", "*.cpp")
 ]
 
@@ -67,7 +68,8 @@ Import("arm_compute_test_framework")
 test_env.Append(LIBS = arm_compute_test_framework)
 
 # Disable floating-point expression contraction (e.g. fused multiply-add operations)
-test_env.Append(CXXFLAGS = ['-ffp-contract=off'])
+if not 'windows' in env['os']:
+    test_env.Append(CXXFLAGS = ['-ffp-contract=off'])
 
 # Remove -Wnoexcept from tests
 if 'g++' in test_env['CXX'] and '-Wnoexcept' in test_env['CXXFLAGS']:
@@ -77,18 +79,19 @@ load_whole_archive = '-Wl,--whole-archive'
 noload_whole_archive = '-Wl,--no-whole-archive'
 if 'macos' in test_env['os']:
     load_whole_archive = '-Wl,-force_load'
-    noload_whole_archive = '-Wl,-noall_load'
+    noload_whole_archive = ''
+
+if (env['multi_isa']):
+     test_env.Append(CPPDEFINES=['ARM_COMPUTE_ENABLE_BF16'])
 
 if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
     Import("arm_compute_a")
-    Import("arm_compute_core_a")
     Import("arm_compute_graph_a")
-    test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a, arm_compute_core_a])
+    test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a])
     arm_compute_lib = arm_compute_graph_a
 else:
     Import("arm_compute_graph_so")
-    Import("arm_compute_core_a")
-    test_env.Append(LIBS = ["arm_compute_graph", "arm_compute", "arm_compute_core"])
+    test_env.Append(LIBS = ["arm_compute_graph", "arm_compute"])
     arm_compute_lib = arm_compute_graph_so
 
 if env['os'] in ['bare_metal']:
@@ -114,6 +117,10 @@ filter_pattern = test_env['test_filter']
 files_validation += Glob('validation/CPP/' + filter_pattern)
 
 if env['opencl']:
+    if env['experimental_dynamic_fusion']:
+        files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern)
+        files_validation += Glob('validation/dynamic_fusion/gpu/cl/' + filter_pattern)
+
     filter_pattern = test_env['test_filter']
 
     test_env.Append(CPPDEFINES=['ARM_COMPUTE_CL'])
@@ -125,6 +132,7 @@ if env['opencl']:
 
     files_validation += Glob('validation/CL/*/' + filter_pattern)
     files_validation += Glob('validation/CL/' + filter_pattern)
+
     if env['external_tests_dir']:
         files_validation += Glob(env['external_tests_dir'] + '/tests/validation/CL/' + filter_pattern)
     files_validation += Glob('validation/gpu/unit/*.cpp')
@@ -133,6 +141,7 @@ if env['neon']:
     filter_pattern = test_env['test_filter']
     files_benchmark += Glob('benchmark/NEON/*/' + filter_pattern)
     files_benchmark += Glob('benchmark/NEON/' + filter_pattern)
+    test_env.Append(CPPPATH = ["#/src/cpu/kernels/assembly/"])
     if env['external_tests_dir']:
         files_benchmark += Glob(env['external_tests_dir'] + '/tests/benchmark/NEON/' + filter_pattern)
 
@@ -150,7 +159,7 @@ if env['neon']:
 extra_link_flags = []
 if env['os'] == 'android':
     test_env.Append(LIBS = ["log"])
-elif env['os'] not in ['bare_metal', 'macos']:
+elif env['os'] not in ['windows','bare_metal', 'macos']:
     test_env.Append(LIBS = ["rt"])
     extra_link_flags += ['-fstack-protector-strong']
 
@@ -166,14 +175,20 @@ bm_link_flags = []
 if test_env['linker_script']:
     bm_link_flags += ['-Wl,--build-id=none', '-T', env['linker_script']]
 
-if test_env['reference_openmp'] and env['os'] not in ['bare_metal', 'macos']:
-   test_env['CXXFLAGS'].append('-fopenmp')
-   test_env['LINKFLAGS'].append('-fopenmp')
+if test_env['reference_openmp'] and env['os'] not in ['bare_metal', 'macos','windows']:
+    test_env['CXXFLAGS'].append('-fopenmp')
+    test_env['LINKFLAGS'].append('-fopenmp')
+
+    if 'ndk_above_r21' in env:
+        test_env['LINKFLAGS'].append('-static-openmp')
+
+# Testing for fixed format GEMM kernels.
+if env['fixed_format_kernels'] and test_env['validation_tests']:
+    test_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS'])
 
 if test_env['validation_tests']:
-    arm_compute_validation_framework = env.StaticLibrary('arm_compute_validation_framework', Glob('validation/reference/*.cpp') + Glob('validation/*.cpp'), LINKFLAGS=test_env['LINKFLAGS'], CXXFLAGS=test_env['CXXFLAGS'], LIBS= [ arm_compute_test_framework, arm_compute_core_a])
+    arm_compute_validation_framework = env.StaticLibrary('arm_compute_validation_framework', Glob('validation/reference/*.cpp') + Glob('validation/*.cpp'), LINKFLAGS=test_env['LINKFLAGS'], CXXFLAGS=test_env['CXXFLAGS'], LIBS= [ arm_compute_test_framework ])
     Depends(arm_compute_validation_framework , arm_compute_test_framework)
-    Depends(arm_compute_validation_framework , arm_compute_core_a)
 
     program_objects = files_validation + common_objects
     if test_env['os'] == 'bare_metal':
@@ -272,8 +287,9 @@ if test_env['benchmark_examples']:
                 #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
                 prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
                 arm_compute_benchmark_examples += [ prog ]
+
     arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples)
     Depends(arm_compute_benchmark_examples, arm_compute_test_framework)
     Depends(arm_compute_benchmark_examples, arm_compute_lib)
     Default(arm_compute_benchmark_examples)
-    Export('arm_compute_benchmark_examples')
-\ No newline at end of file
+    Export('arm_compute_benchmark_examples')
diff --git a/tests/SimpleTensor.h b/tests/SimpleTensor.h
index c1bd7f87b5..419621e808 100644
--- a/tests/SimpleTensor.h
+++ b/tests/SimpleTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,6 +173,15 @@ public:
      */
     QuantizationInfo quantization_info() const override;
 
+    /** Set the quantization information of the tensor.
+     *
+     * This function does not have any effect on the raw quantized data of the tensor.
+     * It simply changes the quantization information, hence changes the dequantized values.
+     *
+     * @return A reference to the current object.
+     */
+    SimpleTensor<T> &quantization_info(const QuantizationInfo &qinfo);
+
     /** Constant pointer to the underlying buffer.
      *
      * @return a constant pointer to the data.
@@ -335,6 +344,13 @@ QuantizationInfo SimpleTensor<T>::quantization_info() const
 }
 
 template <typename T>
+SimpleTensor<T> &SimpleTensor<T>::quantization_info(const QuantizationInfo &qinfo)
+{
+    _quantization_info = qinfo;
+    return *this;
+}
+
+template <typename T>
 size_t SimpleTensor<T>::size() const
 {
     const size_t size = std::accumulate(_shape.cbegin(), _shape.cend(), 1, std::multiplies<size_t>());
@@ -376,6 +392,8 @@ int SimpleTensor<T>::num_channels() const
         case Format::S16:
         case Format::U32:
         case Format::S32:
+        case Format::U64:
+        case Format::S64:
         case Format::F16:
         case Format::F32:
             return 1;
diff --git a/tests/SimpleTensorPrinter.h b/tests/SimpleTensorPrinter.h
index 6c1506b40d..e4ca66bb36 100644
--- a/tests/SimpleTensorPrinter.h
+++ b/tests/SimpleTensorPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,6 +22,9 @@
  * SOFTWARE.
  */
 
+#ifndef ARM_COMPUTE_TEST_SIMPLE_TENSOR_PRINTER
+#define ARM_COMPUTE_TEST_SIMPLE_TENSOR_PRINTER
+
 #include "arm_compute/core/Error.h"
 
 #include "tests/RawTensor.h"
@@ -34,8 +37,6 @@ namespace arm_compute
 {
 namespace test
 {
-namespace
-{
 template <typename T>
 inline std::string prettify_tensor(const SimpleTensor<T> &input, const IOFormatInfo &io_fmt = IOFormatInfo{ IOFormatInfo::PrintRegion::NoPadding })
 {
@@ -152,6 +153,6 @@ void print_simpletensor(const SimpleTensor<T> &tensor, const std::string &title,
     }
 }
 #endif // PRINT_TENSOR_LIMIT
-}
 } // namespace test
 } // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_SIMPLE_TENSOR_PRINTER */
diff --git a/tests/TypePrinter.h b/tests/TypePrinter.h
deleted file mode 100644
index 612360d4f0..0000000000
--- a/tests/TypePrinter.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_TYPE_PRINTER_H
-#define ARM_COMPUTE_TEST_TYPE_PRINTER_H
-
-#include "tests/Types.h"
-
-namespace arm_compute
-{
-/** Formatted output of the GradientDimension type.
- *
- * @param[out] os  Output stream
- * @param[in]  dim Type to output
- *
- * @return Modified output stream.
- */
-inline ::std::ostream &operator<<(::std::ostream &os, const GradientDimension &dim)
-{
-    switch(dim)
-    {
-        case GradientDimension::GRAD_X:
-            os << "GRAD_X";
-            break;
-        case GradientDimension::GRAD_Y:
-            os << "GRAD_Y";
-            break;
-        case GradientDimension::GRAD_XY:
-            os << "GRAD_XY";
-            break;
-        default:
-            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
-    }
-
-    return os;
-}
-
-/** Formatted output of the GradientDimension type.
- *
- * @param[in] type Type to output
- *
- * @return Formatted string.
- */
-inline std::string to_string(const arm_compute::GradientDimension &type)
-{
-    std::stringstream str;
-    str << type;
-    return str.str();
-}
-
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_TYPE_PRINTER_H */
diff --git a/tests/Types.h b/tests/Types.h
index c8e9a755a7..5d5a8207d5 100644
--- a/tests/Types.h
+++ b/tests/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,14 +30,6 @@
 
 namespace arm_compute
 {
-/** Gradient dimension type. */
-enum class GradientDimension
-{
-    GRAD_X,  /**< x gradient dimension */
-    GRAD_Y,  /**< y gradient dimension */
-    GRAD_XY, /**< x and y gradient dimension */
-};
-
 /** Min and max values and locations */
 template <typename MinMaxType>
 struct MinMaxLocationValues
@@ -47,37 +39,5 @@ struct MinMaxLocationValues
     std::vector<Coordinates2D> min_loc{}; /**< Min value location */
     std::vector<Coordinates2D> max_loc{}; /**< Max value location */
 };
-
-/** Parameters of Optical Flow algorithm. */
-struct OpticalFlowParameters
-{
-    OpticalFlowParameters(Termination termination,
-                          float       epsilon,
-                          size_t      num_iterations,
-                          size_t      window_dimension,
-                          bool        use_initial_estimate)
-        : termination{ std::move(termination) },
-          epsilon{ std::move(epsilon) },
-          num_iterations{ std::move(num_iterations) },
-          window_dimension{ std::move(window_dimension) },
-          use_initial_estimate{ std::move(use_initial_estimate) }
-    {
-    }
-
-    Termination termination;
-    float       epsilon;
-    size_t      num_iterations;
-    size_t      window_dimension;
-    bool        use_initial_estimate;
-};
-
-/** Internal keypoint class for Lucas-Kanade Optical Flow */
-struct InternalKeyPoint
-{
-    float x{ 0.f };                 /**< x coordinate of the keypoint */
-    float y{ 0.f };                 /**< y coordinate of the keypoint */
-    bool  tracking_status{ false }; /**< the tracking status of the keypoint */
-};
-
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_TEST_TYPES_H */
diff --git a/tests/Utils.h b/tests/Utils.h
index 2569c41a9e..7b831468d3 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,6 @@
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/PyramidInfo.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
@@ -50,6 +48,7 @@
 #include <type_traits>
 #include <vector>
 
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/RuntimeContext.h"
 
@@ -135,7 +134,7 @@ using make_unsigned_conditional_t = typename std::conditional<std::is_integral<T
 
 // clang-format on
 // *INDENT-ON*
-}
+} // namespace traits
 
 /** Look up the format corresponding to a channel.
  *
@@ -241,101 +240,6 @@ inline ValidRegion shape_to_valid_region(const TensorShape &a_shape, bool border
     return valid_region;
 }
 
-/** Create a valid region for Gaussian Pyramid Half based on tensor shape and valid region at level "i - 1" and border mode
- *
- * @note The border size is 2 in case of Gaussian Pyramid Half
- *
- * @param[in] a_shape          Shape used at level "i - 1" of Gaussian Pyramid Half
- * @param[in] a_valid_region   Valid region used at level "i - 1" of Gaussian Pyramid Half
- * @param[in] border_undefined (Optional) Boolean indicating if the border mode is undefined.
- *
- *  return The valid region for the level "i" of Gaussian Pyramid Half
- */
-inline ValidRegion shape_to_valid_region_gaussian_pyramid_half(const TensorShape &a_shape, const ValidRegion &a_valid_region, bool border_undefined = false)
-{
-    constexpr int border_size = 2;
-
-    ValidRegion valid_region{ Coordinates(), a_shape };
-
-    Coordinates &anchor = valid_region.anchor;
-    TensorShape &shape  = valid_region.shape;
-
-    // Compute tensor shape for level "i" of Gaussian Pyramid Half
-    // dst_width  = (src_width + 1) * 0.5f
-    // dst_height = (src_height + 1) * 0.5f
-    shape.set(0, (a_shape[0] + 1) * 0.5f);
-    shape.set(1, (a_shape[1] + 1) * 0.5f);
-
-    if(border_undefined)
-    {
-        ARM_COMPUTE_ERROR_ON(shape.num_dimensions() < 2);
-
-        // Compute the left and top invalid borders
-        float invalid_border_left = static_cast<float>(a_valid_region.anchor.x() + border_size) / 2.0f;
-        float invalid_border_top  = static_cast<float>(a_valid_region.anchor.y() + border_size) / 2.0f;
-
-        // For the new anchor point we can have 2 cases:
-        // 1) If the width/height of the tensor shape is odd, we have to take the ceil value of (a_valid_region.anchor.x() + border_size) / 2.0f or (a_valid_region.anchor.y() + border_size / 2.0f
-        // 2) If the width/height of the tensor shape is even, we have to take the floor value of (a_valid_region.anchor.x() + border_size) / 2.0f or (a_valid_region.anchor.y() + border_size) / 2.0f
-        // In this manner we should be able to propagate correctly the valid region along all levels of the pyramid
-        invalid_border_left = (a_shape[0] % 2) ? std::ceil(invalid_border_left) : std::floor(invalid_border_left);
-        invalid_border_top  = (a_shape[1] % 2) ? std::ceil(invalid_border_top) : std::floor(invalid_border_top);
-
-        // Set the anchor point
-        anchor.set(0, static_cast<int>(invalid_border_left));
-        anchor.set(1, static_cast<int>(invalid_border_top));
-
-        // Compute shape
-        // Calculate the right and bottom invalid borders at the previous level of the pyramid
-        const float prev_invalid_border_right  = static_cast<float>(a_shape[0] - (a_valid_region.anchor.x() + a_valid_region.shape[0]));
-        const float prev_invalid_border_bottom = static_cast<float>(a_shape[1] - (a_valid_region.anchor.y() + a_valid_region.shape[1]));
-
-        // Calculate the right and bottom invalid borders at the current level of the pyramid
-        const float invalid_border_right  = std::ceil((prev_invalid_border_right + static_cast<float>(border_size)) / 2.0f);
-        const float invalid_border_bottom = std::ceil((prev_invalid_border_bottom + static_cast<float>(border_size)) / 2.0f);
-
-        const int valid_shape_x = std::max(0, static_cast<int>(shape.x()) - static_cast<int>(invalid_border_left) - static_cast<int>(invalid_border_right));
-        const int valid_shape_y = std::max(0, static_cast<int>(shape.y()) - static_cast<int>(invalid_border_top) - static_cast<int>(invalid_border_bottom));
-
-        shape.set(0, valid_shape_x);
-        shape.set(1, valid_shape_y);
-    }
-
-    return valid_region;
-}
-
-/** Create a valid region for Laplacian Pyramid based on tensor shape and valid region at level "i - 1" and border mode
- *
- * @note The border size is 2 in case of Laplacian Pyramid
- *
- * @param[in] a_shape          Shape used at level "i - 1" of Laplacian Pyramid
- * @param[in] a_valid_region   Valid region used at level "i - 1" of Laplacian Pyramid
- * @param[in] border_undefined (Optional) Boolean indicating if the border mode is undefined.
- *
- *  return The valid region for the level "i" of Laplacian Pyramid
- */
-inline ValidRegion shape_to_valid_region_laplacian_pyramid(const TensorShape &a_shape, const ValidRegion &a_valid_region, bool border_undefined = false)
-{
-    ValidRegion valid_region = shape_to_valid_region_gaussian_pyramid_half(a_shape, a_valid_region, border_undefined);
-
-    if(border_undefined)
-    {
-        const BorderSize gaussian5x5_border(2);
-
-        auto border_left   = static_cast<int>(gaussian5x5_border.left);
-        auto border_right  = static_cast<int>(gaussian5x5_border.right);
-        auto border_top    = static_cast<int>(gaussian5x5_border.top);
-        auto border_bottom = static_cast<int>(gaussian5x5_border.bottom);
-
-        valid_region.anchor.set(0, valid_region.anchor[0] + border_left);
-        valid_region.anchor.set(1, valid_region.anchor[1] + border_top);
-        valid_region.shape.set(0, std::max(0, static_cast<int>(valid_region.shape[0]) - border_right - border_left));
-        valid_region.shape.set(1, std::max(0, static_cast<int>(valid_region.shape[1]) - border_top - border_bottom));
-    }
-
-    return valid_region;
-}
-
 /** Write the value after casting the pointer according to @p data_type.
  *
  * @warning The type of the value must match the specified data type.
@@ -566,110 +470,6 @@ inline T create_tensor(const TensorShape &shape, Format format, IRuntimeContext
     return create_tensor<T>(info, ctx);
 }
 
-/** Create and initialize a multi-image of the given type.
- *
- * @param[in] shape  Tensor shape.
- * @param[in] format Format type.
- *
- * @return Initialized tensor of given type.
- */
-template <typename T>
-inline T create_multi_image(const TensorShape &shape, Format format)
-{
-    T multi_image;
-    multi_image.init(shape.x(), shape.y(), format);
-
-    return multi_image;
-}
-
-/** Create and initialize a HOG (Histogram of Oriented Gradients) of the given type.
- *
- * @param[in] hog_info HOGInfo object
- *
- * @return Initialized HOG of given type.
- */
-template <typename T>
-inline T create_HOG(const HOGInfo &hog_info)
-{
-    T hog;
-    hog.init(hog_info);
-
-    return hog;
-}
-
-/** Create and initialize a Pyramid of the given type.
- *
- * @param[in] pyramid_info The PyramidInfo object.
- *
- * @return Initialized Pyramid of given type.
- */
-template <typename T>
-inline T create_pyramid(const PyramidInfo &pyramid_info)
-{
-    T pyramid;
-    pyramid.init_auto_padding(pyramid_info);
-
-    return pyramid;
-}
-
-/** Initialize a convolution matrix.
- *
- * @param[in, out] conv   The input convolution matrix.
- * @param[in]      width  The width of the convolution matrix.
- * @param[in]      height The height of the convolution matrix.
- * @param[in]      seed   The random seed to be used.
- */
-inline void init_conv(int16_t *conv, unsigned int width, unsigned int height, std::random_device::result_type seed)
-{
-    std::mt19937                           gen(seed);
-    std::uniform_int_distribution<int16_t> distribution_int16(-32768, 32767);
-
-    for(unsigned int i = 0; i < width * height; ++i)
-    {
-        conv[i] = distribution_int16(gen);
-    }
-}
-
-/** Initialize a separable convolution matrix.
- *
- * @param[in, out] conv   The input convolution matrix.
- * @param[in]      width  The width of the convolution matrix.
- * @param[in]      height The height of the convolution matrix.
- * @param[in]      seed   The random seed to be used.
- */
-inline void init_separable_conv(int16_t *conv, unsigned int width, unsigned int height, std::random_device::result_type seed)
-{
-    std::mt19937 gen(seed);
-    // Set it between -128 and 127 to ensure the matrix does not overflow
-    std::uniform_int_distribution<int16_t> distribution_int16(-128, 127);
-
-    int16_t *conv_row = new int16_t[width];
-    int16_t *conv_col = new int16_t[height];
-
-    conv_row[0] = conv_col[0] = 1;
-    for(unsigned int i = 1; i < width; ++i)
-    {
-        conv_row[i] = distribution_int16(gen);
-    }
-
-    for(unsigned int i = 1; i < height; ++i)
-    {
-        conv_col[i] = distribution_int16(gen);
-    }
-
-    // Multiply two matrices
-    for(unsigned int i = 0; i < width; ++i)
-    {
-        for(unsigned int j = 0; j < height; ++j)
-        {
-            conv[i * width + j] = conv_col[i] * conv_row[j];
-        }
-    }
-
-    delete[] conv_row;
-    delete[] conv_col;
-}
-
 /** Create a vector with a uniform distribution of floating point values across the specified range.
  *
  * @param[in] num_values The number of values to be created.
@@ -694,44 +494,6 @@ inline std::vector<T> generate_random_real(unsigned int num_values, T min, T max
     return v;
 }
 
-/** Create a vector of random keypoints for pyramid representation.
- *
- * @param[in] shape         The shape of the input tensor.
- * @param[in] num_keypoints The number of keypoints to be created.
- * @param[in] seed          The random seed to be used.
- * @param[in] num_levels    The number of pyramid levels.
- *
- * @return A vector that contains the requested number of random keypoints
- */
-inline std::vector<KeyPoint> generate_random_keypoints(const TensorShape &shape, size_t num_keypoints, std::random_device::result_type seed, size_t num_levels = 1)
-{
-    std::vector<KeyPoint> keypoints;
-    std::mt19937          gen(seed);
-
-    // Calculate distribution bounds
-    const auto min        = static_cast<int>(std::pow(2, num_levels));
-    const auto max_width  = static_cast<int>(shape.x());
-    const auto max_height = static_cast<int>(shape.y());
-
-    ARM_COMPUTE_ERROR_ON(min > max_width || min > max_height);
-
-    // Create distributions
-    std::uniform_int_distribution<> dist_w(min, max_width);
-    std::uniform_int_distribution<> dist_h(min, max_height);
-
-    for(unsigned int i = 0; i < num_keypoints; i++)
-    {
-        KeyPoint keypoint;
-        keypoint.x               = dist_w(gen);
-        keypoint.y               = dist_h(gen);
-        keypoint.tracking_status = 1;
-
-        keypoints.push_back(keypoint);
-    }
-
-    return keypoints;
-}
-
 template <typename T, typename ArrayAccessor_T>
 inline void fill_array(ArrayAccessor_T &&array, const std::vector<T> &v)
 {
@@ -814,6 +576,67 @@ inline void sync_tensor_if_necessary(TensorType &tensor)
 {
     ARM_COMPUTE_UNUSED(tensor);
 }
+
+/** Construct and return object for dimensions' state filled with the given value
+ *
+ * @param[in] value The value to fill
+ *
+ * @return Constructed class
+ */
+inline ITensorInfo::TensorDimsState construct_dims_state(int32_t value)
+{
+    auto states = ITensorInfo::TensorDimsState{};
+    std::fill(states.begin(), states.end(), value);
+    return states;
+}
+
+/** Construct and return object for dimensions' state filled with the value for dynamic state
+ *
+ * @return Constructed class filled with the value for dynamic state
+ */
+inline ITensorInfo::TensorDimsState construct_dynamic_dims_state()
+{
+    return construct_dims_state(ITensorInfo::get_dynamic_state_value());
+}
+
+/** Construct and return object for dimensions' state filled with the value for non-dynamic state
+ *
+ * @return Constructed class filled with the value for non-dynamic state
+ */
+inline ITensorInfo::TensorDimsState construct_static_dims_state()
+{
+    return construct_dims_state(ITensorInfo::get_static_state_value());
+}
+
+/** Set the dimension states of the given tensor to dynamic
+ *
+ * @param[in] t The tensor to set to dynamic state
+ *
+ */
+template <typename TensorType>
+void set_tensor_dynamic(TensorType &t)
+{
+    t.info()->set_tensor_dims_state(construct_dynamic_dims_state());
+}
+
+/** Set the dimension states of the given tensor to static
+ *
+ * @param[in] t The tensor to set to static state
+ *
+ */
+template <typename TensorType>
+void set_tensor_static(TensorType &t)
+{
+    t.info()->set_tensor_dims_state(construct_static_dims_state());
+}
+
+inline experimental::dynamic_fusion::Conv2dAttributes convert_pad_stride_info_to_conv_attr(const PadStrideInfo &info, const Size2D &dialation)
+{
+    const Padding2D info_pad(info.pad_left(), info.pad_right(), info.pad_top(), info.pad_bottom());
+    const Size2D    info_stride(info.stride().first, info.stride().second);
+    return arm_compute::experimental::dynamic_fusion::Conv2dAttributes().pad(info_pad).stride(info_stride).dilation(dialation);
+}
+
 } // namespace test
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_TEST_UTILS_H */
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000..a3a352681e
--- /dev/null
+++ b/tests/benchmark/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Copyright (c) 2023 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+target_sources(arm_compute_benchmark PRIVATE NEON/Scale.cpp)
diff --git a/tests/benchmark/fixtures/ScaleFixture.h b/tests/benchmark/fixtures/ScaleFixture.h
index 953872ea64..81f34bd538 100644
--- a/tests/benchmark/fixtures/ScaleFixture.h
+++ b/tests/benchmark/fixtures/ScaleFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,6 @@ template <typename TensorType, typename Function, typename Accessor>
 class ScaleFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, DataLayout data_layout, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
     {
         constexpr float max_width  = 8192.0f;
diff --git a/tests/benchmark/fixtures/ScaleLayerFixture.h b/tests/benchmark/fixtures/ScaleLayerFixture.h
index a6a798f5b1..59cc12c77d 100644
--- a/tests/benchmark/fixtures/ScaleLayerFixture.h
+++ b/tests/benchmark/fixtures/ScaleLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,6 @@ template <typename TensorType, typename Function, typename Accessor, typename T>
 class ScaleLayerFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy, float sx, float sy, DataType data_type)
     {
         constexpr float max_width  = 8192.0f;
@@ -63,8 +62,8 @@ public:
 
         scale_layer.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy });
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
diff --git a/tests/benchmark_examples/RunExample.cpp b/tests/benchmark_examples/RunExample.cpp
index e5dfe74d0e..3e56ea2e64 100644
--- a/tests/benchmark_examples/RunExample.cpp
+++ b/tests/benchmark_examples/RunExample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #define BENCHMARK_EXAMPLES
 #include "utils/Utils.cpp"
 
+#include "arm_compute/core/Version.h"
 #include "arm_compute/runtime/Scheduler.h"
 #include "tests/framework/Framework.h"
 #include "tests/framework/Macros.h"
@@ -128,7 +129,17 @@ int run_example(int argc, char **argv, std::unique_ptr<Example> example)
     CLGEMMHeuristicsHandle gemm_h;
     if(opencl_is_available())
     {
-        auto ctx_dev_err = create_opencl_context_and_device();
+        CLBackendType backend_type = CLBackendType::Native;
+        for(auto &arg : example_args->value())
+        {
+            if(arg.find("--target=clvk") != std::string::npos)
+            {
+                backend_type = CLBackendType::Clvk;
+                break;
+            }
+        }
+
+        auto ctx_dev_err = create_opencl_context_and_device(backend_type);
         ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
         CLScheduler::get()
         .default_init_with_context(std::get<1>(ctx_dev_err), std::get<0>(ctx_dev_err), nullptr, &gemm_h);
diff --git a/tests/datasets/ActivationFunctionsDataset.h b/tests/datasets/ActivationFunctionsDataset.h
index 1f3313c476..9b0d775376 100644
--- a/tests/datasets/ActivationFunctionsDataset.h
+++ b/tests/datasets/ActivationFunctionsDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,10 @@ public:
                             ActivationLayerInfo::ActivationFunction::SQRT,
                             ActivationLayerInfo::ActivationFunction::SQUARE,
                             ActivationLayerInfo::ActivationFunction::TANH,
-                            ActivationLayerInfo::ActivationFunction::IDENTITY
+                            ActivationLayerInfo::ActivationFunction::IDENTITY,
+#ifdef __aarch64__
+                            ActivationLayerInfo::ActivationFunction::GELU,
+#endif /* __aarch64__ */
     })
     {
     }
diff --git a/tests/datasets/BatchToSpaceDataset.h b/tests/datasets/BatchToSpaceDataset.h
index 1edd457aad..2670af50df 100644
--- a/tests/datasets/BatchToSpaceDataset.h
+++ b/tests/datasets/BatchToSpaceDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,15 +38,17 @@ namespace datasets
 class BatchToSpaceLayerDataset
 {
 public:
-    using type = std::tuple<TensorShape, TensorShape, TensorShape>;
+    using type = std::tuple<TensorShape, std::vector<int32_t>, CropInfo, TensorShape>;
 
     struct iterator
     {
-        iterator(std::vector<TensorShape>::const_iterator src_it,
-                 std::vector<TensorShape>::const_iterator block_shape_it,
-                 std::vector<TensorShape>::const_iterator dst_it)
+        iterator(std::vector<TensorShape>::const_iterator          src_it,
+                 std::vector<std::vector<int32_t>>::const_iterator block_shape_it,
+                 std::vector<CropInfo>::const_iterator             crop_info_it,
+                 std::vector<TensorShape>::const_iterator          dst_it)
             : _src_it{ std::move(src_it) },
               _block_shape_it{ std::move(block_shape_it) },
+              _crop_info_it{ std::move(crop_info_it) },
               _dst_it{ std::move(dst_it) }
         {
         }
@@ -56,44 +58,48 @@ public:
             std::stringstream description;
             description << "In=" << *_src_it << ":";
             description << "BlockShape=" << *_block_shape_it << ":";
+            description << "CropInfo=" << *_crop_info_it << ":";
             description << "Out=" << *_dst_it;
             return description.str();
         }
 
         BatchToSpaceLayerDataset::type operator*() const
         {
-            return std::make_tuple(*_src_it, *_block_shape_it, *_dst_it);
+            return std::make_tuple(*_src_it, *_block_shape_it, *_crop_info_it, *_dst_it);
         }
 
         iterator &operator++()
         {
             ++_src_it;
             ++_block_shape_it;
+            ++_crop_info_it;
             ++_dst_it;
 
             return *this;
         }
 
     private:
-        std::vector<TensorShape>::const_iterator _src_it;
-        std::vector<TensorShape>::const_iterator _block_shape_it;
-        std::vector<TensorShape>::const_iterator _dst_it;
+        std::vector<TensorShape>::const_iterator          _src_it;
+        std::vector<std::vector<int32_t>>::const_iterator _block_shape_it;
+        std::vector<CropInfo>::const_iterator             _crop_info_it;
+        std::vector<TensorShape>::const_iterator          _dst_it;
     };
 
     iterator begin() const
     {
-        return iterator(_src_shapes.begin(), _block_shape_shapes.begin(), _dst_shapes.begin());
+        return iterator(_src_shapes.begin(), _block_shapes.begin(), _crop_infos.begin(), _dst_shapes.begin());
     }
 
     int size() const
     {
-        return std::min(_src_shapes.size(), std::min(_block_shape_shapes.size(), _dst_shapes.size()));
+        return std::min(std::min(std::min(_src_shapes.size(), _block_shapes.size()), _crop_infos.size()), _dst_shapes.size());
     }
 
-    void add_config(TensorShape src, TensorShape block_shape, TensorShape dst)
+    void add_config(const TensorShape &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst)
     {
         _src_shapes.emplace_back(std::move(src));
-        _block_shape_shapes.emplace_back(std::move(block_shape));
+        _block_shapes.emplace_back(std::move(block_shape));
+        _crop_infos.emplace_back(std::move(crop_info));
         _dst_shapes.emplace_back(std::move(dst));
     }
 
@@ -102,35 +108,60 @@ protected:
     BatchToSpaceLayerDataset(BatchToSpaceLayerDataset &&) = default;
 
 private:
-    std::vector<TensorShape> _src_shapes{};
-    std::vector<TensorShape> _block_shape_shapes{};
-    std::vector<TensorShape> _dst_shapes{};
+    std::vector<TensorShape>          _src_shapes{};
+    std::vector<std::vector<int32_t>> _block_shapes{};
+    std::vector<CropInfo>             _crop_infos{};
+    std::vector<TensorShape>          _dst_shapes{};
 };
 
+/** Follow NCHW data layout across all datasets. I.e.
+ * TensorShape(Width(X), Height(Y), Channel(Z), Batch(W))
+ */
+
 class SmallBatchToSpaceLayerDataset final : public BatchToSpaceLayerDataset
 {
 public:
     SmallBatchToSpaceLayerDataset()
     {
-        add_config(TensorShape(1U, 1U, 1U, 4U), TensorShape(2U), TensorShape(2U, 2U, 1U, 1U));
-        add_config(TensorShape(3U, 1U, 1U, 4U), TensorShape(2U), TensorShape(6U, 2U, 1U, 1U));
-        add_config(TensorShape(1U, 2U, 2U, 4U), TensorShape(2U), TensorShape(2U, 4U, 2U, 1U));
-        add_config(TensorShape(1U, 3U, 1U, 8U), TensorShape(2U), TensorShape(2U, 6U, 1U, 2U));
-        add_config(TensorShape(3U, 4U, 1U, 4U), TensorShape(2U), TensorShape(6U, 8U, 1U, 1U));
-        add_config(TensorShape(1U, 1U, 1U, 8U), TensorShape(4U, 2U), TensorShape(4U, 2U, 1U, 1U));
-        add_config(TensorShape(3U, 1U, 1U, 8U), TensorShape(2U, 4U), TensorShape(6U, 4U, 1U, 1U));
+        // Block size = 1 (effectively no batch to space)
+        add_config(TensorShape(1U, 1U, 1U, 4U), { 1U, 1U }, CropInfo(), TensorShape(1U, 1U, 1U, 4U));
+        add_config(TensorShape(8U, 2U, 4U, 3U), { 1U, 1U }, CropInfo(), TensorShape(8U, 2U, 4U, 3U));
+        // Same block size in both x and y
+        add_config(TensorShape(3U, 2U, 1U, 4U), { 2U, 2U }, CropInfo(), TensorShape(6U, 4U, 1U, 1U));
+        add_config(TensorShape(1U, 3U, 2U, 9U), { 3U, 3U }, CropInfo(), TensorShape(3U, 9U, 2U, 1U));
+        // Different block size in x and y
+        add_config(TensorShape(5U, 7U, 7U, 4U), { 2U, 1U }, CropInfo(), TensorShape(10U, 7U, 7U, 2U));
+        add_config(TensorShape(3U, 3U, 1U, 8U), { 1U, 2U }, CropInfo(), TensorShape(3U, 6U, 1U, 4U));
+        add_config(TensorShape(5U, 2U, 2U, 6U), { 3U, 2U }, CropInfo(), TensorShape(15U, 4U, 2U, 1U));
     }
 };
 
+/** Relatively small shapes that are still large enough to leave room for testing cropping of the output shape
+ */
+class SmallBatchToSpaceLayerWithCroppingDataset final : public BatchToSpaceLayerDataset
+{
+public:
+    SmallBatchToSpaceLayerWithCroppingDataset()
+    {
+        // Crop in both dims
+        add_config(TensorShape(5U, 3U, 2U, 8U), { 2U, 2U }, CropInfo(1U, 1U, 2U, 1U), TensorShape(8U, 3U, 2U, 2U));
+        // Left crop in x dim
+        add_config(TensorShape(1U, 1U, 1U, 20U), { 4U, 5U }, CropInfo(2U, 1U, 0U, 2U), TensorShape(1U, 3U, 1U, 1U));
+        // Left crop in y dim
+        add_config(TensorShape(3U, 1U, 1U, 8U), { 2U, 4U }, CropInfo(0U, 0U, 2U, 1U), TensorShape(6U, 1U, 1U, 1U));
+    }
+};
 class LargeBatchToSpaceLayerDataset final : public BatchToSpaceLayerDataset
 {
 public:
     LargeBatchToSpaceLayerDataset()
     {
-        add_config(TensorShape(64U, 32U, 2U, 4U), TensorShape(2U), TensorShape(128U, 64U, 2U, 1U));
-        add_config(TensorShape(128U, 16U, 2U, 16U), TensorShape(2U), TensorShape(512U, 64U, 2U, 1U));
-        add_config(TensorShape(16U, 8U, 2U, 8U), TensorShape(4U, 2U), TensorShape(64U, 16U, 2U, 1U));
-        add_config(TensorShape(8U, 16U, 2U, 8U), TensorShape(2U, 4U), TensorShape(16U, 64U, 2U, 1U));
+        // Same block size in both x and y
+        add_config(TensorShape(64U, 32U, 2U, 4U), { 2U, 2U }, CropInfo(), TensorShape(128U, 64U, 2U, 1U));
+        add_config(TensorShape(128U, 16U, 2U, 18U), { 3U, 3U }, CropInfo(), TensorShape(384U, 48U, 2U, 2U));
+        // Different block size in x and y
+        add_config(TensorShape(16U, 8U, 2U, 8U), { 4U, 1U }, CropInfo(), TensorShape(64U, 8U, 2U, 2U));
+        add_config(TensorShape(8U, 16U, 2U, 8U), { 2U, 4U }, CropInfo(), TensorShape(16U, 64U, 2U, 1U));
     }
 };
 } // namespace datasets
diff --git a/tests/datasets/ChannelShuffleLayerDataset.h b/tests/datasets/ChannelShuffleLayerDataset.h
index afab893234..a851480fa1 100644
--- a/tests/datasets/ChannelShuffleLayerDataset.h
+++ b/tests/datasets/ChannelShuffleLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,6 +105,7 @@ class SmallRandomChannelShuffleLayerDataset final : public ChannelShuffleLayerDa
 public:
     SmallRandomChannelShuffleLayerDataset()
     {
+        add_config(TensorShape(1U, 1U, 605U, 16U), 5);
         add_config(TensorShape(15U, 16U, 4U, 12U), 2);
         add_config(TensorShape(21U, 11U, 12U, 7U), 4);
         add_config(TensorShape(21U, 11U, 12U, 7U), 6);
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 86804fb4c6..17e03368ac 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_DATASET
-#define ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_DATASET
+#ifndef ACL_TESTS_DATASETS_DEPTHWISECONVOLUTIONLAYERDATASET_H
+#define ACL_TESTS_DATASETS_DEPTHWISECONVOLUTIONLAYERDATASET_H
 
 #include "utils/TypePrinter.h"
 
@@ -121,13 +121,13 @@ public:
     SmallDepthwiseConvolutionLayerDataset()
     {
         add_config(TensorShape(7U, 7U, 1U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(23U, 27U, 5U), Size2D(3U, 5U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(3U, 3U, 2U), Size2D(2U, 2U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(33U, 27U, 7U), Size2D(7U, 3U), PadStrideInfo(3, 2, 1, 0));
         // Asymmetric padding
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
         // Ceil rounding
-        add_config(TensorShape(7U, 8U, 5U, 9U), Size2D(8U, 6U), PadStrideInfo(2, 3, 1, 1, 1, 3, DimensionRoundingType::CEIL), Size2D(1U, 2U));
+        add_config(TensorShape(7U, 8U, 5U, 9U), Size2D(8U, 6U), PadStrideInfo(2, 3, 1, 1, 1, 3, DimensionRoundingType::CEIL));
     }
 };
 
@@ -138,20 +138,50 @@ public:
     LargeDepthwiseConvolutionLayerDataset()
     {
         add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 1));
-        add_config(TensorShape(17U, 31U, 2U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(17U, 31U, 2U), Size2D(13U, 9U), PadStrideInfo(1, 2, 1, 1));
         add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0));
         add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(233U, 277U, 55U), Size2D(1U, 1U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(333U, 277U, 77U), Size2D(1U, 1U), PadStrideInfo(3, 2, 1, 0));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 4U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 0));
-        add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 4U), PadStrideInfo(2, 3, 0, 1));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 4U), PadStrideInfo(2, 1, 1, 1));
+        add_config(TensorShape(133U, 127U, 55U), Size2D(1U, 1U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(233U, 109U, 77U), Size2D(1U, 1U), PadStrideInfo(3, 2, 1, 0));
+        add_config(TensorShape(177U, 111U, 22U), Size2D(3U, 4U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(233U, 87U, 55U), Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 0));
+        add_config(TensorShape(333U, 79U, 77U), Size2D(3U, 4U), PadStrideInfo(2, 3, 0, 1));
+        add_config(TensorShape(67U, 211U, 22U), Size2D(3U, 4U), PadStrideInfo(2, 1, 1, 1));
         // Asymmetric padding
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        // Padding greater than kernel size.
+        add_config(TensorShape(128, 56, 56), Size2D(4, 4), PadStrideInfo(2, 2, 0, 10, 0, 10, DimensionRoundingType::FLOOR));
+    }
+};
+
+class LargeDepthwiseConvolutionLayerDatasetFp16Subset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    LargeDepthwiseConvolutionLayerDatasetFp16Subset()
+    {
+        add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 4U), PadStrideInfo(1, 2, 0, 1));
+        add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(233U, 109U, 77U), Size2D(1U, 1U), PadStrideInfo(3, 2, 1, 0));
+        add_config(TensorShape(177U, 111U, 22U), Size2D(3U, 4U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(67U, 211U, 22U), Size2D(3U, 4U), PadStrideInfo(2, 1, 1, 1));
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
+        // Padding greater than kernel size.
+        add_config(TensorShape(128, 56, 56), Size2D(4, 4), PadStrideInfo(2, 2, 0, 10, 0, 10, DimensionRoundingType::FLOOR));
+    }
+};
+
+/** Dataset containing large kernel size for generic depthwise convolution. */
+class LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset()
+    {
+        add_config(TensorShape(6U, 210U, 8U), Size2D(4U, 194U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 
@@ -186,22 +216,39 @@ class LargeDepthwiseConvolutionLayerDataset3x3 final : public DepthwiseConvoluti
 public:
     LargeDepthwiseConvolutionLayerDataset3x3()
     {
-        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
         add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0));
-        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1));
-        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1));
         add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 0));
+        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 2));
+
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 2));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1));
-        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1));
-        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 0));
-        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1));
-        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1));
-        add_config(TensorShape(233U, 277U, 55U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0));
-        add_config(TensorShape(333U, 277U, 77U, 5U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1));
-        // Width and height are a multipile of the processing tile size
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 2, 1));
+
+        add_config(TensorShape(77U, 209U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(123U, 76U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0));
+        add_config(TensorShape(133U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 0));
+        add_config(TensorShape(77U, 95U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1));
+
+        // Width and height are a multiple of the processing tile size
+        add_config(TensorShape(32U, 21U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1));
+    }
+};
+
+class LargeDepthwiseConvolutionLayerDataset3x3Fp16Subset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    LargeDepthwiseConvolutionLayerDataset3x3Fp16Subset()
+    {
+        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 2));
+
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 2));
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1));
+
+        add_config(TensorShape(123U, 76U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0));
+        add_config(TensorShape(77U, 95U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1));
+
+        // Width and height are a multiple of the processing tile size
         add_config(TensorShape(32U, 21U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1));
     }
 };
@@ -221,8 +268,6 @@ public:
         add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
         add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), Size2D(2U, 2U));
         add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, DimensionRoundingType::CEIL));
-        // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
-        // add_config(TensorShape(9U, 9U, 1U), Size2D(3U, 3U), PadStrideInfo(2, 2, 2, 2, DimensionRoundingType::CEIL), Size2D(2U, 2U));
     }
 };
 /** Dataset containing optimized, 3x3 depthwise convolution shapes. */
@@ -232,14 +277,14 @@ public:
     LargeOptimizedDepthwiseConvolutionLayerDataset3x3()
     {
         // Stride 1
-        add_config(TensorShape(233U, 277U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
-        add_config(TensorShape(233U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(233U, 173U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
+        add_config(TensorShape(133U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
         add_config(TensorShape(7U, 7U, 21U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
         add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
         add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
         // Stride 2
-        add_config(TensorShape(233U, 277U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
-        add_config(TensorShape(233U, 277U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(133U, 97U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
+        add_config(TensorShape(153U, 77U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
         add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::FLOOR));
         add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
         add_config(TensorShape(8U, 8U, 33U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
@@ -260,14 +305,31 @@ public:
         add_config(TensorShape(7U, 7U, 16U), Size2D(5U, 5U), PadStrideInfo(1, 1, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U));
         // Stride 2
         add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
-        // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
-        // add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), Size2D(2U, 2U));
+        add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), Size2D(2U, 2U));
         add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 2, 2, 2, 2, DimensionRoundingType::CEIL));
-        // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
-        // add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 4, 4, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U));
+        add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 4, 4, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U));
+    }
+};
+
+/** Dataset containing in-place 1x1 depthwise convolution shapes.
+ *
+ * For a depthwise convolution op to be in-place:
+ * * Output has the same shape as the input;
+ *      * 1x1 filter
+ *      * stride == 1
+ *      * dilation == 1
+ *      * No padding
+ */
+class SmallInPlaceDepthwiseConvolutionLayerDataset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    SmallInPlaceDepthwiseConvolutionLayerDataset()
+    {
+        add_config(TensorShape(7U, 7U, 1U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(11U, 13U, 16U), Size2D(1U, 1U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_DATASET */
-\ No newline at end of file
+#endif // ACL_TESTS_DATASETS_DEPTHWISECONVOLUTIONLAYERDATASET_H
diff --git a/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h b/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h
index 9e2a3cf548..a58650a5e4 100644
--- a/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_DILATED_CONVOLUTION_LAYER_DATASET
-#define ARM_COMPUTE_TEST_DILATED_CONVOLUTION_LAYER_DATASET
+#ifndef ACL_TESTS_DATASETS_DILATEDDEPTHWISECONVOLUTIONLAYERDATASET_H
+#define ACL_TESTS_DATASETS_DILATEDDEPTHWISECONVOLUTIONLAYERDATASET_H
 
 #include "utils/TypePrinter.h"
 
@@ -48,6 +48,7 @@ public:
         add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 2U), PadStrideInfo(1, 1, 0, 0), Size2D(2U, 1U));
         add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 2U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 2U));
         add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 2U), PadStrideInfo(2, 2, 0, 0), Size2D(1U, 2U));
+        add_config(TensorShape(7U, 8U, 5U, 9U), Size2D(8U, 6U), PadStrideInfo(2, 3, 1, 1, 1, 3, DimensionRoundingType::CEIL), Size2D(1U, 2U));
 
         add_config(TensorShape(7U, 8U, 1U), Size2D(2U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(2U, 2U));
         add_config(TensorShape(23U, 27U, 5U), Size2D(3U, 5U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 1U));
@@ -96,15 +97,16 @@ public:
     LargeDepthwiseDilatedConvolutionLayerDataset()
     {
         add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U));
-        add_config(TensorShape(17U, 31U, 2U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(1U, 2U));
         add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(1U, 3U));
+        add_config(TensorShape(17U, 31U, 2U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(1U, 2U));
         add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U));
-        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 2U));
-        add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(3, 2, 1, 0), Size2D(3U, 2U));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U));
-        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 2U));
-        add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1), Size2D(2U, 2U));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 5U));
+
+        add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0), Size2D(2U, 2U));
+        add_config(TensorShape(233U, 177U, 77U), Size2D(3U, 3U), PadStrideInfo(3, 2, 1, 0), Size2D(3U, 2U));
+        add_config(TensorShape(77U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U));
+        add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 2U));
+        add_config(TensorShape(233U, 177U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1), Size2D(2U, 2U));
+        add_config(TensorShape(177U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 5U));
         // Asymmetric padding
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR), Size2D(3U, 2U));
         add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR), Size2D(4U, 4U));
@@ -113,6 +115,24 @@ public:
     }
 };
 
+class LargeDepthwiseDilatedConvolutionLayerDatasetFp16Subset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    LargeDepthwiseDilatedConvolutionLayerDatasetFp16Subset() // Shorter list of dilated configs; name suggests FP16 test runs - TODO confirm against callers
+    {
+        add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U));
+        add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(1U, 3U));
+        add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U));
+        // Larger shapes, all with a 3x3 kernel
+        add_config(TensorShape(77U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 2U));
+        add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 2U));
+        add_config(TensorShape(177U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 5U));
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR), Size2D(4U, 4U));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR), Size2D(2U, 2U));
+    }
+};
+
 /** Dataset containing large, 3x3 depthwise convolution shapes with dilation. */
 class LargeDepthwiseDilatedConvolutionLayerDataset3x3 final : public DepthwiseConvolutionLayerDataset
 {
@@ -120,24 +140,44 @@ public:
     LargeDepthwiseDilatedConvolutionLayerDataset3x3()
     {
         add_config(TensorShape(32U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1), Size2D(2U, 1U));
+
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1), Size2D(2U, 2U));
-        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0), Size2D(2U, 2U));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(2U, 3U));
-        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 0), Size2D(2U, 1U));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1), Size2D(3U, 3U));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(2U, 2U));
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1), Size2D(4U, 4U));
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1), Size2D(2U, 5U));
+
+        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0), Size2D(2U, 2U));
+        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 0), Size2D(2U, 1U));
         add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 0), Size2D(2U, 2U));
+
+        add_config(TensorShape(133U, 177U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 5U));
+        add_config(TensorShape(233U, 77U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 0), Size2D(4U, 4U));
+        add_config(TensorShape(77U, 211U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(4U, 4U));
+        add_config(TensorShape(77U, 111U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(3U, 3U));
+    }
+};
+
+class LargeDepthwiseDilatedConvolutionLayerDataset3x3Fp16Subset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    LargeDepthwiseDilatedConvolutionLayerDataset3x3Fp16Subset()
+    {
+        add_config(TensorShape(32U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 1), Size2D(2U, 1U));
+
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1), Size2D(2U, 1U));
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 1), Size2D(3U, 3U));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1), Size2D(4U, 4U));
         add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1), Size2D(2U, 5U));
-        add_config(TensorShape(233U, 277U, 55U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0), Size2D(3U, 3U));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1), Size2D(4U, 4U));
-        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0), Size2D(5U, 5U));
-        add_config(TensorShape(333U, 277U, 77U, 5U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1), Size2D(4U, 4U));
-        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(3U, 3U));
+
+        add_config(TensorShape(21U, 31U, 9U, 10U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 0), Size2D(2U, 2U));
+
+        add_config(TensorShape(77U, 111U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1), Size2D(3U, 3U));
     }
 };
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DILATED_CONVOLUTION_LAYER_DATASET */
-\ No newline at end of file
+#endif // ACL_TESTS_DATASETS_DILATEDDEPTHWISECONVOLUTIONLAYERDATASET_H
diff --git a/tests/datasets/DynamicFusionDataset.h b/tests/datasets/DynamicFusionDataset.h
new file mode 100644
index 0000000000..5a1453b9ab
--- /dev/null
+++ b/tests/datasets/DynamicFusionDataset.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_DATASETS_DYNAMICFUSIONDATASET
+#define TESTS_DATASETS_DYNAMICFUSIONDATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+/** Base dataset of shape triples: every add_config() call stores one (shape0, shape1, shape2) entry. */
+class DynamicFusionThreeInputs
+{
+public:
+    using type = std::tuple<TensorShape, TensorShape, TensorShape>;
+
+    /** Cursor that steps through the three shape lists together, one entry at a time. */
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator it0,
+                 std::vector<TensorShape>::const_iterator it1,
+                 std::vector<TensorShape>::const_iterator it2)
+            : _it0{ std::move(it0) }, _it1{ std::move(it1) }, _it2{ std::move(it2) }
+        {
+        }
+
+        /** @return The current entry rendered as "shape0=<..>:shape1=<..>:shape2=<..>:". */
+        std::string description() const
+        {
+            std::stringstream text;
+            text << "shape0=" << *_it0 << ":"
+                 << "shape1=" << *_it1 << ":"
+                 << "shape2=" << *_it2 << ":";
+            return text.str();
+        }
+
+        DynamicFusionThreeInputs::type operator*() const
+        {
+            return std::make_tuple(*_it0, *_it1, *_it2);
+        }
+
+        iterator &operator++()
+        {
+            ++_it0;
+            ++_it1;
+            ++_it2;
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _it0;
+        std::vector<TensorShape>::const_iterator _it1;
+        std::vector<TensorShape>::const_iterator _it2;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_shapes0.cbegin(), _shapes1.cbegin(), _shapes2.cbegin());
+    }
+
+    /** @return Number of entries, taken as the length of the shortest of the three lists. */
+    int size() const
+    {
+        return static_cast<int>(std::min(_shapes0.size(), std::min(_shapes1.size(), _shapes2.size())));
+    }
+
+    void add_config(TensorShape shape0, TensorShape shape1, TensorShape shape2)
+    {
+        _shapes0.push_back(std::move(shape0));
+        _shapes1.push_back(std::move(shape1));
+        _shapes2.push_back(std::move(shape2));
+    }
+
+protected:
+    DynamicFusionThreeInputs()                            = default;
+    DynamicFusionThreeInputs(DynamicFusionThreeInputs &&) = default;
+
+private:
+    std::vector<TensorShape> _shapes0{};
+    std::vector<TensorShape> _shapes1{};
+    std::vector<TensorShape> _shapes2{};
+};
+
+class DynamicFusionElementwiseBinaryTwoOpsSmallShapes final : public DynamicFusionThreeInputs
+{
+public:
+    DynamicFusionElementwiseBinaryTwoOpsSmallShapes() // Registers four small (shape0, shape1, shape2) triples
+    {
+        add_config(TensorShape{ 9U, 9U, 5U }, TensorShape{ 9U, 9U, 5U }, TensorShape{ 9U, 9U, 5U }); // All three shapes identical
+        add_config(TensorShape{ 9U, 9U, 5U }, TensorShape{ 1U, 1U, 1U } /* shape1: broadcast in X, Y and Z */, TensorShape{ 9U, 9U, 5U });
+        add_config(TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 1U, 1U } /* shape1: broadcast in Y and Z */, TensorShape{ 27U, 13U, 2U });
+        add_config(TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 1U, 1U } /* shape2: broadcast in Y and Z */);
+    }
+};
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* TESTS_DATASETS_DYNAMICFUSIONDATASET */
diff --git a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
index 7ab068c211..b0ad4879ba 100644
--- a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
+++ b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET
-#define ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET
+#ifndef ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H
+#define ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H
 
 #include "utils/TypePrinter.h"
 
@@ -40,21 +40,17 @@ namespace datasets
 class GEMMLowpFusedOffsetOutputDataset
 {
 public:
-    using type = std::tuple<TensorShape, TensorShape, TensorShape, int32_t, int32_t, GEMMLowpOutputStageInfo>;
+    using type = std::tuple<TensorShape, TensorShape, TensorShape, GEMMLowpOutputStageType>;
 
     struct iterator
     {
         iterator(std::vector<TensorShape>::const_iterator             a_it,
                  std::vector<TensorShape>::const_iterator             b_it,
                  std::vector<TensorShape>::const_iterator             c_it,
-                 std::vector<int32_t>::const_iterator                 a_offset_it,
-                 std::vector<int32_t>::const_iterator                 b_offset_it,
-                 std::vector<GEMMLowpOutputStageInfo>::const_iterator output_stage_it)
+                 std::vector<GEMMLowpOutputStageType>::const_iterator output_stage_it)
             : _a_it{ std::move(a_it) },
               _b_it{ std::move(b_it) },
               _c_it{ std::move(c_it) },
-              _a_offset_it{ std::move(a_offset_it) },
-              _b_offset_it{ std::move(b_offset_it) },
               _output_stage_it{ std::move(output_stage_it) }
         {
         }
@@ -65,33 +61,14 @@ public:
             description << "A=" << *_a_it << ":";
             description << "B=" << *_b_it << ":";
             description << "C=" << *_c_it << ":";
-            description << "a_offset=" << *_a_offset_it << ":";
-            description << "b_offset=" << *_b_offset_it << ":";
-            description << "output_type=" << string_from_gemmlowp_output_stage((*_output_stage_it).type) << ":";
-            description << "output_offset=" << (*_output_stage_it).gemmlowp_offset << ":";
-            description << "output_multiplier={";
-            for(auto it = (*_output_stage_it).gemmlowp_multipliers.begin(); it != (*_output_stage_it).gemmlowp_multipliers.end(); ++it)
-            {
-                description << (*it) << ", ";
-            }
-            description << "}:";
-            description << "output_shift={";
-
-            for(auto it = (*_output_stage_it).gemmlowp_shifts.begin(); it != (*_output_stage_it).gemmlowp_shifts.end(); ++it)
-            {
-                description << (*it) << ", ";
-            }
-            description << "}:";
-            description << "output_min=" << (*_output_stage_it).gemmlowp_min_bound << ":";
-            description << "output_max=" << (*_output_stage_it).gemmlowp_max_bound << ":";
-            description << "is_quantized_per_channel=" << (*_output_stage_it).is_quantized_per_channel << ":";
+            description << "output_type=" << string_from_gemmlowp_output_stage(*_output_stage_it) << ":";
 
             return description.str();
         }
 
         GEMMLowpFusedOffsetOutputDataset::type operator*() const
         {
-            return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it, *_output_stage_it);
+            return std::make_tuple(*_a_it, *_b_it, *_c_it, *_output_stage_it);
         }
 
         iterator &operator++()
@@ -99,8 +76,6 @@ public:
             ++_a_it;
             ++_b_it;
             ++_c_it;
-            ++_a_offset_it;
-            ++_b_offset_it;
             ++_output_stage_it;
 
             return *this;
@@ -110,45 +85,27 @@ public:
         std::vector<TensorShape>::const_iterator             _a_it;
         std::vector<TensorShape>::const_iterator             _b_it;
         std::vector<TensorShape>::const_iterator             _c_it;
-        std::vector<int32_t>::const_iterator                 _a_offset_it;
-        std::vector<int32_t>::const_iterator                 _b_offset_it;
-        std::vector<GEMMLowpOutputStageInfo>::const_iterator _output_stage_it;
+        std::vector<GEMMLowpOutputStageType>::const_iterator _output_stage_it;
     };
 
     iterator begin() const
     {
-        return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin(), _output_stage.begin());
+        return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _output_stage.begin());
     }
 
     int size() const
     {
-        return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), std::min(_b_offset.size(), _output_stage.size())))));
+        return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), _output_stage.size())));
     }
 
-    void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+    void add_config(TensorShape a, TensorShape b, TensorShape c, GEMMLowpOutputStageType output_stage)
     {
         _a_shapes.emplace_back(std::move(a));
         _b_shapes.emplace_back(std::move(b));
         _c_shapes.emplace_back(std::move(c));
-        _a_offset.emplace_back(std::move(a_offset));
-        _b_offset.emplace_back(std::move(b_offset));
         _output_stage.emplace_back(std::move(output_stage));
     }
 
-    GEMMLowpOutputStageInfo OutputStageInfo(GEMMLowpOutputStageType type, int32_t offset, int32_t multiplier, int32_t shift, int32_t min, int32_t max)
-    {
-        GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-        output_stage.type                    = type;
-        output_stage.gemmlowp_offset         = offset;
-        output_stage.gemmlowp_multiplier     = multiplier;
-        output_stage.gemmlowp_shift          = shift;
-        output_stage.gemmlowp_min_bound      = min;
-        output_stage.gemmlowp_max_bound      = max;
-        output_stage.gemmlowp_multipliers.push_back(multiplier);
-        output_stage.gemmlowp_shifts.push_back(shift);
-        return output_stage;
-    }
-
 protected:
     GEMMLowpFusedOffsetOutputDataset()                                    = default;
     GEMMLowpFusedOffsetOutputDataset(GEMMLowpFusedOffsetOutputDataset &&) = default;
@@ -157,9 +114,7 @@ private:
     std::vector<TensorShape>             _a_shapes{};
     std::vector<TensorShape>             _b_shapes{};
     std::vector<TensorShape>             _c_shapes{};
-    std::vector<int32_t>                 _a_offset{};
-    std::vector<int32_t>                 _b_offset{};
-    std::vector<GEMMLowpOutputStageInfo> _output_stage{};
+    std::vector<GEMMLowpOutputStageType> _output_stage{};
 };
 
 class SmallGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOffsetOutputDataset
@@ -167,45 +122,72 @@ class SmallGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOff
 public:
     SmallGEMMLowpFusedOffsetOutputUint8Dataset()
     {
-        add_config(TensorShape(21U, 13U), TensorShape(1U, 21U), TensorShape(1U, 13U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 13, 10, 210));
-        add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 100, 2, 13, 10, 210));
-        add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 200, 2, 13, 10, 210));
-        add_config(TensorShape(32U, 72U), TensorShape(16U, 32U), TensorShape(16U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 13, 10, 210));
-
-        add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, 210));
-        add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, 210));
-        add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 2, 254601602, 10, 10, 210));
-        add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 10, 10, 210));
+        add_config(TensorShape(21U, 13U), TensorShape(1U, 21U), TensorShape(1U, 13U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(32U, 72U), TensorShape(16U, 32U), TensorShape(16U, 72U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
     }
 };
 
-class SmallGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffsetOutputDataset
+class SmallGEMMLowpFusedBatchedMatMulDataset final : public GEMMLowpFusedOffsetOutputDataset
 {
 public:
-    SmallGEMMLowpFusedOffsetOutputInt8Dataset()
+    SmallGEMMLowpFusedBatchedMatMulDataset()
+    {
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+    }
+};
+
+class SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset final : public GEMMLowpFusedOffsetOutputDataset
+{
+public:
+    SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset() // In every entry C's 2nd and 3rd dims multiply to A's 2nd dim (e.g. 7 * 203 = 1421); A and C share the last dim
+    {
+        add_config(TensorShape(21U, 1421U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(31U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(38U, 1200U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(32U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(16U, 1600U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(16U, 1600U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+    }
+};
+
+class SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset final : public GEMMLowpFusedOffsetOutputDataset
+{
+public:
+    SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset()
     {
-        add_config(TensorShape(21U, 1U), TensorShape(1U, 21U), TensorShape(1U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 13, -10, 110));
-        add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, -10, 110));
-        add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, -10, 110));
-        add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -40, 2, 13, -10, 110));
-
-        add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601600, 10, -10, 110));
-        add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 1, 254601600, 10, -10, 110));
-        add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 10, -10, 110));
-        add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 10, -10, 110));
+        add_config(TensorShape(21U, 7U, 203U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(31U, 1U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(38U, 4U, 300U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(32U, 1U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(16U, 8U, 200U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(16U, 8U, 200U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
     }
 };
 
-class SmallGEMMLowpFusedOffsetOutputPerChannelDataset final : public GEMMLowpFusedOffsetOutputDataset
+class SmallGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffsetOutputDataset
 {
 public:
-    SmallGEMMLowpFusedOffsetOutputPerChannelDataset()
+    SmallGEMMLowpFusedOffsetOutputInt8Dataset()
     {
-        add_config(TensorShape(21U, 1U, 6U), TensorShape(43U, 21U, 6U), TensorShape(43U, 1U, 6U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -200, 2, 13, 10, 210));
-        add_config(TensorShape(21U, 13U, 3U), TensorShape(33U, 21U, 3U), TensorShape(33U, 13U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 13, 10, 210));
-        add_config(TensorShape(31U, 3U, 2U), TensorShape(72U, 31U, 2U), TensorShape(72U, 3U, 2U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, 10, 210));
-        add_config(TensorShape(52U, 13U, 7U), TensorShape(33U, 52U, 7U), TensorShape(33U, 13U, 7U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 100, 2, 13, 10, 210));
-        add_config(TensorShape(52U, 26U, 8U), TensorShape(33U, 52U, 8U), TensorShape(33U, 26U, 8U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, 10, 210));
+        add_config(TensorShape(21U, 1U), TensorShape(1U, 21U), TensorShape(1U, 1U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
     }
 };
 
@@ -214,15 +196,12 @@ class LargeGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOff
 public:
     LargeGEMMLowpFusedOffsetOutputUint8Dataset()
     {
-        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 18, 10, 210));
-        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 100, 2, 18, 10, 210));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 200, 2, 18, 10, 210));
-        add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 18, 10, 210));
-
-        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601600, 15, 10, 210));
-        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 1, 254601600, 15, 10, 210));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 15, 10, 210));
-        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 15, 10, 210));
+        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+
     }
 };
 
@@ -231,18 +210,17 @@ class LargeGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffs
 public:
     LargeGEMMLowpFusedOffsetOutputInt8Dataset()
     {
-        add_config(TensorShape(923U, 1U, 15U), TensorShape(871U, 923U, 15U), TensorShape(871U, 1U, 15U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 18, -10, 110));
-        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 18, -10, 110));
-        add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 18, -10, 110));
-        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 18, -10, 110));
-
-        add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 15, -10, 110));
-        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 15, -10, 110));
-        add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 2, 254601602, 15, -10, 110));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 15, -10, 110));
+        add_config(TensorShape(923U, 1U, 15U), TensorShape(871U, 923U, 15U), TensorShape(871U, 1U, 15U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), GEMMLowpOutputStageType::QUANTIZE_DOWN);
+        add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
     }
 };
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET */
+#endif // ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H
diff --git a/tests/datasets/GatherDataset.h b/tests/datasets/GatherDataset.h
index 29a99d5239..74ea3b4a06 100644
--- a/tests/datasets/GatherDataset.h
+++ b/tests/datasets/GatherDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -106,6 +106,64 @@ private:
     std::vector<int>         _axis{};
 };
 
+class SmallGatherMultiDimIndicesDataset final : public GatherDataset
+{
+public:
+    SmallGatherMultiDimIndicesDataset()
+    {
+        add_config(TensorShape(2U, 6U), TensorShape(4U, 9U), 1);
+        add_config(TensorShape(15U, 15U), TensorShape(3U, 2U, 2U), 1);
+        add_config(TensorShape(15U, 15U), TensorShape(2U, 11U), 1);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(2U, 7U), 1);
+        add_config(TensorShape(1U, 5U, 3U), TensorShape(1U, 7U, 3U), 1);
+
+        add_config(TensorShape(3U, 5U), TensorShape(2U, 3U), 0);
+        add_config(TensorShape(9U), TensorShape(3U, 2U, 4U), 0);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(5U, 6U), 0);
+
+        add_config(TensorShape(7U, 4U, 5U), TensorShape(2U, 3U), 2);
+        add_config(TensorShape(8U, 2U, 3U), TensorShape(4U, 2U, 5U), 2);
+    }
+};
+
+class CLSmallGatherMultiDimIndicesDataset final : public GatherDataset
+{
+public:
+    CLSmallGatherMultiDimIndicesDataset()
+    {
+        add_config(TensorShape(2U, 6U), TensorShape(4U, 9U), 0);
+        add_config(TensorShape(15U, 15U), TensorShape(3U, 2U, 2U), 0);
+        add_config(TensorShape(15U, 15U), TensorShape(2U, 11U), 0);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(2U, 7U), 0);
+
+        add_config(TensorShape(3U, 5U), TensorShape(2U, 3U), 0);
+        add_config(TensorShape(9U), TensorShape(3U, 2U, 4U), 0);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(5U, 6U), 0);
+
+        add_config(TensorShape(7U, 4U, 5U), TensorShape(2U, 3U),0);
+
+        add_config(TensorShape(2U, 6U), TensorShape(4U, 9U), 1);
+        add_config(TensorShape(15U, 15U), TensorShape(3U, 2U, 2U), 1);
+        add_config(TensorShape(15U, 15U), TensorShape(2U, 11U), 1);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(2U, 7U), 1);
+
+        add_config(TensorShape(3U, 5U), TensorShape(2U, 3U), 1);
+        add_config(TensorShape(9U), TensorShape(3U, 2U, 4U), 1);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(5U, 6U), 1);
+
+        add_config(TensorShape(7U, 4U, 5U), TensorShape(2U, 3U),1);
+
+        add_config(TensorShape(2U, 6U), TensorShape(4U, 9U), 2);
+        add_config(TensorShape(15U, 15U), TensorShape(2U, 11U), 2);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(2U, 7U), 2);
+
+        add_config(TensorShape(3U, 5U), TensorShape(2U, 3U), 2);
+        add_config(TensorShape(5U, 3U, 4U), TensorShape(5U, 6U), 2);
+
+        add_config(TensorShape(7U, 4U, 5U), TensorShape(2U, 3U),2);
+    }
+};
+
 class SmallGatherDataset final : public GatherDataset
 {
 public:
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 1cffc9a221..c299f2460b 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_LARGE_CONVOLUTION_LAYER_DATASET
-#define ARM_COMPUTE_TEST_LARGE_CONVOLUTION_LAYER_DATASET
+#ifndef ACL_TESTS_DATASETS_LARGECONVOLUTIONLAYERDATASET_H
+#define ACL_TESTS_DATASETS_LARGECONVOLUTIONLAYERDATASET_H
 
 #include "tests/datasets/ConvolutionLayerDataset.h"
 
@@ -44,18 +44,31 @@ public:
     {
         // Kernel size 3
         // Batch size 1
-        add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 54U, 256U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(54U, 54U, 256U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0));
-        // Batch size 3, 2 and 4
-        add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 111U, 128U, 2U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(224U, 222U, 32U), TensorShape(3U, 3U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 113U, 32U), TensorShape(3U, 3U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 128U), TensorShape(128U), TensorShape(51U, 54U, 128U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 3U, 128U, 128U), TensorShape(128U), TensorShape(54U, 54U, 128U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 1, 1));
+
+        // Batch > 1
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 1));
+    }
+};
+
+class LargeWinogradConvolutionLayer3x3DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer3x3DatasetFp16Subset()
+    {
+        // Kernel size 3
+        // Batch size 1
+        add_config(TensorShape(224U, 222U, 32U), TensorShape(3U, 3U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 3U, 128U, 128U), TensorShape(128U), TensorShape(54U, 54U, 128U), PadStrideInfo(1, 1, 0, 0));
+
+        // Batch > 1
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 1));
     }
 };
 
@@ -66,18 +79,31 @@ public:
     {
         // Kernel size 3
         // Batch size 1
-        add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 1U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 0));
-        add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 1U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 0));
-        add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 1U, 128U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 0));
-        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 1U, 256U, 256U), TensorShape(256U), TensorShape(56U, 56U, 256U), PadStrideInfo(1, 1, 1, 0));
-        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 1U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 0));
-        add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 1U, 512U, 512U), TensorShape(512U), TensorShape(12U, 14U, 512U), PadStrideInfo(1, 1, 0, 0));
-        // Batch size 3, 2 and 4
-        add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 1U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 0));
-        add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 1U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(224U, 222U, 32U), TensorShape(3U, 1U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(112U, 113U, 32U), TensorShape(3U, 1U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 1U, 64U, 129U), TensorShape(129U), TensorShape(112U, 112U, 129U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 128U), TensorShape(128U), TensorShape(51U, 56U, 128U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 1U, 128U, 128U), TensorShape(128U), TensorShape(56U, 56U, 128U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 128U), TensorShape(128U), TensorShape(26U, 28U, 128U), PadStrideInfo(1, 1, 0, 0));
+
+        // Batch > 1
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 0));
+    }
+};
+
+class LargeWinogradConvolutionLayer3x1DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer3x1DatasetFp16Subset()
+    {
+        // Kernel size 3x1
+        // Batch size 1
+        add_config(TensorShape(112U, 113U, 32U), TensorShape(3U, 1U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 1U, 125U, 128U), TensorShape(128U), TensorShape(51U, 56U, 128U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 1U, 257U, 128U), TensorShape(128U), TensorShape(26U, 28U, 128U), PadStrideInfo(1, 1, 0, 0));
+
+        // Batch > 1
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 1U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 1, 0));
     }
 };
 
@@ -88,18 +114,31 @@ public:
     {
         // Kernel size 3
         // Batch size 1
-        add_config(TensorShape(224U, 222U, 64U), TensorShape(1U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(112U, 113U, 64U), TensorShape(1U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(112U, 112U, 128U), TensorShape(1U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 256U), TensorShape(256U), TensorShape(53U, 56U, 256U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(56U, 56U, 256U), TensorShape(1U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(28U, 28U, 512U), TensorShape(1U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(14U, 14U, 512U), TensorShape(1U, 3U, 512U, 512U), TensorShape(512U), TensorShape(14U, 12U, 512U), PadStrideInfo(1, 1, 0, 0));
-        // Batch size 3, 2 and 4
-        add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(1U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(1U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1));
-        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(224U, 222U, 32U), TensorShape(1U, 3U, 32U, 32U), TensorShape(32U), TensorShape(224U, 222U, 32U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(112U, 113U, 32U), TensorShape(1U, 3U, 32U, 64U), TensorShape(64U), TensorShape(112U, 113U, 64U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(112U, 112U, 64U), TensorShape(1U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 128U), TensorShape(128U), TensorShape(53U, 56U, 128U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(56U, 56U, 128U), TensorShape(1U, 3U, 128U, 128U), TensorShape(128U), TensorShape(56U, 54U, 128U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 0, 1));
+
+        // Batch > 1
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 0, 1));
+    }
+};
+
+class LargeWinogradConvolutionLayer1x3DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer1x3DatasetFp16Subset()
+    {
+        // Kernel size 1x3
+        // Batch size 1
+        add_config(TensorShape(112U, 112U, 64U), TensorShape(1U, 3U, 64U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(1U, 3U, 125U, 128U), TensorShape(128U), TensorShape(53U, 56U, 128U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(1U, 3U, 257U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 0, 1));
+
+        // Batch > 1
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(1U, 3U, 127U, 64U), TensorShape(64U), TensorShape(111U, 112U, 64U, 4U), PadStrideInfo(1, 1, 0, 1));
     }
 };
 
@@ -110,15 +149,27 @@ public:
     {
         // Kernel size 5
         // Batch size 1
-        add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(220U, 220U, 64U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 2, 2));
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 32U), TensorShape(32U), TensorShape(220U, 220U, 32U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 2));
 
-        // Batch size 2, 3 and 4
-        add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(220U, 220U, 64U, 2U), PadStrideInfo(1, 1, 0, 0));
+        // Batch > 1
+        add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 2));
+    }
+};
+
+class LargeWinogradConvolutionLayer5x5DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer5x5DatasetFp16Subset()
+    {
+        // Kernel size 5
+        // Batch size 1
+        add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 2));
+
+        // Batch > 1
         add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 2));
-        add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 
@@ -128,15 +179,26 @@ public:
     LargeWinogradConvolutionLayer5x1Dataset()
     {
         // Batch size 1
-        add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 1U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U), PadStrideInfo(1, 1, 2, 0));
-        add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 2, 0));
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 1U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 2, 0));
         add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 1U, 42U, 100U), TensorShape(100U), TensorShape(177U, 152U, 100U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 1U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 0));
 
-        // Batch size 2, 3 and 4
-        add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 1U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 2, 0));
+        // Batch > 1
+        add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 0));
+    }
+};
+
+class LargeWinogradConvolutionLayer5x1DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer5x1DatasetFp16Subset()
+    {
+        // Batch size 1
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 1U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 2, 0));
+        add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 1U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 0));
+
+        // Batch > 1
         add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 2, 0));
-        add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 1U, 42U, 100U), TensorShape(100U), TensorShape(177U, 152U, 100U, 4U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 
@@ -146,15 +208,12 @@ public:
     LargeWinogradConvolutionLayer7x1Dataset()
     {
         // Batch size 1
-        add_config(TensorShape(224U, 224U, 3U), TensorShape(7U, 1U, 3U, 64U), TensorShape(64U), TensorShape(218U, 224U, 64U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(123U, 134U, 16U), TensorShape(7U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 3, 0));
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(7U, 1U, 3U, 32U), TensorShape(32U), TensorShape(218U, 224U, 32U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(181U, 152U, 42U), TensorShape(7U, 1U, 42U, 100U), TensorShape(100U), TensorShape(175U, 152U, 100U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(200U, 201U, 24U), TensorShape(7U, 1U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 3, 0));
 
-        // Batch size 2, 3 and 4
-        add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(7U, 1U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 3, 0));
+        // Batch > 1
         add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(7U, 1U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 3, 0));
-        add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(7U, 1U, 42U, 100U), TensorShape(100U), TensorShape(175U, 152U, 100U, 4U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 
@@ -164,15 +223,26 @@ public:
     LargeWinogradConvolutionLayer1x7Dataset()
     {
         // Batch size 1
-        add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 7U, 3U, 64U), TensorShape(64U), TensorShape(224U, 218U, 64U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(123U, 134U, 16U), TensorShape(1U, 7U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U), PadStrideInfo(1, 1, 0, 3));
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 7U, 3U, 32U), TensorShape(32U), TensorShape(224U, 218U, 32U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 7U, 42U, 100U), TensorShape(100U), TensorShape(181U, 146U, 100U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(200U, 201U, 24U), TensorShape(1U, 7U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 0, 3));
 
-        // Batch size 2, 3 and 4
-        add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(1U, 7U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 0, 3));
+        // Batch > 1
+        add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 7U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 0, 3));
+    }
+};
+
+class LargeWinogradConvolutionLayer1x7DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer1x7DatasetFp16Subset()
+    {
+        // Batch size 1
+        add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 7U, 42U, 100U), TensorShape(100U), TensorShape(181U, 146U, 100U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(200U, 201U, 24U), TensorShape(1U, 7U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 0, 3));
+
+        // Batch > 1
         add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 7U, 16U, 7U), TensorShape(7U), TensorShape(123U, 134U, 7U, 3U), PadStrideInfo(1, 1, 0, 3));
-        add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(1U, 7U, 42U, 100U), TensorShape(100U), TensorShape(181U, 146U, 100U, 4U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 
@@ -182,15 +252,26 @@ public:
     LargeWinogradConvolutionLayer1x5Dataset()
     {
         // Batch size 1
-        add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 5U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U), PadStrideInfo(1, 1, 0, 2));
-        add_config(TensorShape(123U, 134U, 16U), TensorShape(1U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 130U, 7U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 5U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 0, 2));
         add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 5U, 42U, 100U), TensorShape(100U), TensorShape(181U, 148U, 100U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(200U, 201U, 24U), TensorShape(1U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 0, 2));
 
-        // Batch size 2, 3 and 4
-        add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(1U, 5U, 3U, 64U), TensorShape(64U), TensorShape(224U, 224U, 64U, 2U), PadStrideInfo(1, 1, 0, 2));
+        // Batch size > 1
+        add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 130U, 7U, 3U), PadStrideInfo(1, 1, 0, 0));
+    }
+};
+
+class LargeWinogradConvolutionLayer1x5DatasetFp16Subset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer1x5DatasetFp16Subset()
+    {
+        // Batch size 1
+        add_config(TensorShape(224U, 224U, 3U), TensorShape(1U, 5U, 3U, 32U), TensorShape(32U), TensorShape(224U, 224U, 32U), PadStrideInfo(1, 1, 0, 2));
+        add_config(TensorShape(181U, 152U, 42U), TensorShape(1U, 5U, 42U, 100U), TensorShape(100U), TensorShape(181U, 148U, 100U), PadStrideInfo(1, 1, 0, 0));
+
+        // Batch size > 1
         add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(1U, 5U, 16U, 7U), TensorShape(7U), TensorShape(123U, 130U, 7U, 3U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(1U, 5U, 42U, 100U), TensorShape(100U), TensorShape(181U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0));
     }
 };
 
@@ -213,6 +294,16 @@ public:
     }
 };
 
+class VeryLargeConvolutionLayerDataset final : public ConvolutionLayerDataset
+{
+public:
+    VeryLargeConvolutionLayerDataset()
+    {
+        // Tensor size > 1e7 bytes and kernel size (9x9) > 7
+        add_config(TensorShape(336U, 336U, 32U), TensorShape(9U, 9U, 32U, 64U), TensorShape(64U), TensorShape(168U, 168U, 64U), PadStrideInfo(2, 2, 4, 4));
+    }
+};
+
 class LargeGroupedConvolutionLayerDataset final : public ConvolutionLayerDataset
 {
 public:
@@ -233,4 +324,4 @@ public:
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LARGE_CONVOLUTION_LAYER_DATASET */
+#endif // ACL_TESTS_DATASETS_LARGECONVOLUTIONLAYERDATASET_H
diff --git a/tests/datasets/LargeGEMMDataset.h b/tests/datasets/LargeGEMMDataset.h
index 6cdff7f559..e45319ef57 100644
--- a/tests/datasets/LargeGEMMDataset.h
+++ b/tests/datasets/LargeGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_LARGE_GEMM_DATASET
-#define ARM_COMPUTE_TEST_LARGE_GEMM_DATASET
+#ifndef ACL_TESTS_DATASETS_LARGEGEMMDATASET_H
+#define ACL_TESTS_DATASETS_LARGEGEMMDATASET_H
 
 #include "tests/datasets/GEMMDataset.h"
 
@@ -79,7 +79,20 @@ public:
         add_config(TensorShape(1729U, 17U, 10U, 3U), TensorShape(128U, 1729U), TensorShape(128U), TensorShape(128U, 17U, 10U, 3U), 1.0f, 0.3f);
     }
 };
+
+class LargeAccumulateGEMMDataset final : public GEMMDataset // Every config below passes the same shape as 3rd and 4th argument and ends with 1.0f, 0.0f
+{
+public:
+    LargeAccumulateGEMMDataset()
+    {
+        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), TensorShape(871U, 429U), 1.0f, 0.0f);
+        add_config(TensorShape(1021U, 1U), TensorShape(783U, 1021U), TensorShape(783U, 1U), TensorShape(783U, 1U), 1.0f, 0.0f);
+        add_config(TensorShape(1021U, 1U), TensorShape(783U, 1021U), TensorShape(783U, 1U), TensorShape(783U, 1U), 1.0f, 0.0f); // NOTE(review): identical to the previous config - redundant duplicate, or a typo for a different shape? Confirm.
+        add_config(TensorShape(941U, 1U), TensorShape(623U, 941U), TensorShape(623U, 1U), TensorShape(623U, 1U), 1.0f, 0.0f);
+    }
+};
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LARGE_GEMM_DATASET */
+#endif // ACL_TESTS_DATASETS_LARGEGEMMDATASET_H
diff --git a/tests/datasets/LargeMatMulDataset.h b/tests/datasets/LargeMatMulDataset.h
new file mode 100644
index 0000000000..8f6c000d37
--- /dev/null
+++ b/tests/datasets/LargeMatMulDataset.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_LARGEMATMULDATASET
+#define ACL_TESTS_DATASETS_LARGEMATMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/MatMulDataset.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class LargeMatMulDataset final : public MatMulDataset // 4D MatMul configs; each add_config call lists the A, B and Out shapes in that order
+{
+public:
+    LargeMatMulDataset()
+    {
+        add_config(TensorShape(21U, 13U, 3U, 2U), TensorShape(33U, 21U, 3U, 2U), TensorShape(33U, 13U, 3U, 2U));
+        add_config(TensorShape(38U, 12U, 1U, 5U), TensorShape(21U, 38U, 1U, 5U), TensorShape(21U, 12U, 1U, 5U));
+        add_config(TensorShape(45U, 38U, 3U, 2U), TensorShape(21U, 45U, 3U, 2U), TensorShape(21U, 38U, 3U, 2U));
+    }
+};
+
+class HighDimensionalMatMulDataset final : public MatMulDataset
+{
+public:
+    HighDimensionalMatMulDataset()
+    {
+        add_config(TensorShape(5U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U)); // 6D tensors: A, B and Out all share the shape 5x5x2x2x2x2
+    }
+};
+
+class LargeMatMulDatasetRhsExportToCLImageRhsNT final : public MatMulDataset
+{
+public:
+    // For shape choices, please refer to the explanations given in SmallMatMulDatasetRhsExportToCLImageRhsNT (not defined in this header)
+    LargeMatMulDatasetRhsExportToCLImageRhsNT()
+    {
+        add_config(TensorShape(21U, 13U, 3U, 2U), TensorShape(32U, 21U, 3U, 2U), TensorShape(32U, 13U, 3U, 2U)); // 4D; NOTE(review): B's first dimension is 32/20/20 (multiples of 4) in all configs - presumably the export constraint, confirm
+        add_config(TensorShape(38U, 12U, 1U, 5U, 2U), TensorShape(20U, 38U, 1U, 5U, 2U), TensorShape(20U, 12U, 1U, 5U, 2U)); // 5D
+        add_config(TensorShape(45U, 38U, 3U, 2U, 3U), TensorShape(20U, 45U, 3U, 2U, 3U), TensorShape(20U, 38U, 3U, 2U, 3U)); // 5D
+    }
+};
+class LargeMatMulDatasetRhsExportToCLImageRhsT final : public MatMulDataset
+{
+public:
+    // For shape choices, please refer to the explanations given in SmallMatMulDatasetRhsExportToCLImageRhsT (not defined in this header)
+    LargeMatMulDatasetRhsExportToCLImageRhsT()
+    {
+        add_config(TensorShape(28U, 13U, 3U, 2U), TensorShape(32U, 28U, 3U, 2U), TensorShape(32U, 13U, 3U, 2U)); // 4D; NOTE(review): both dimensions of B's leading 2D slice (32x28, 20x40, 20x44) are multiples of 4 - presumably the export constraint, confirm
+        add_config(TensorShape(40U, 12U, 1U, 5U, 2U), TensorShape(20U, 40U, 1U, 5U, 2U), TensorShape(20U, 12U, 1U, 5U, 2U)); // 5D
+        add_config(TensorShape(44U, 38U, 3U, 2U, 3U), TensorShape(20U, 44U, 3U, 2U, 3U), TensorShape(20U, 38U, 3U, 2U, 3U)); // 5D
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ACL_TESTS_DATASETS_LARGEMATMULDATASET */
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2PoolingLayerDataset.h b/tests/datasets/LargeMatMulMMULDataset.h
index 88cac5e449..23e0b3e5c8 100644
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2PoolingLayerDataset.h
+++ b/tests/datasets/LargeMatMulMMULDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,15 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_YOLOV2_POOLING_LAYER_DATASET
-#define ARM_COMPUTE_TEST_YOLOV2_POOLING_LAYER_DATASET
 
-#include "tests/datasets/PoolingLayerDataset.h"
-
-#include "utils/TypePrinter.h"
+#ifndef ACL_TESTS_DATASETS_LARGEMATMULMMULDATASET
+#define ACL_TESTS_DATASETS_LARGEMATMULMMULDATASET
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "tests/datasets/MatMulDataset.h"
 
 namespace arm_compute
 {
@@ -37,24 +35,30 @@ namespace test
 {
 namespace datasets
 {
-class YOLOV2PoolingLayerDataset final : public PoolingLayerDataset
+/** MatMul MMUL shapes are like the plain MatMul shapes, except that K (first dimension of A, second of B) must be a multiple of MMUL_K0, which is 4
+ *  (see src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp for the definition of MMUL_K0). */
+class LargeMatMulMMULDataset final : public MatMulDataset
+{
+public:
+    LargeMatMulMMULDataset()
+    {
+        add_config(TensorShape(24U, 13U, 3U, 2U), TensorShape(33U, 24U, 3U, 2U), TensorShape(33U, 13U, 3U, 2U)); // K = 24
+        add_config(TensorShape(36U, 12U, 1U, 5U), TensorShape(21U, 36U, 1U, 5U), TensorShape(21U, 12U, 1U, 5U)); // K = 36
+        add_config(TensorShape(44U, 38U, 3U, 2U), TensorShape(21U, 44U, 3U, 2U), TensorShape(21U, 38U, 3U, 2U)); // K = 44
+    }
+};
+
+class HighDimensionalMatMulMMULDataset final : public MatMulDataset
 {
 public:
-    YOLOV2PoolingLayerDataset()
+    HighDimensionalMatMulMMULDataset()
     {
-        // pool1
-        add_config(TensorShape(416U, 416U, 32U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)));
-        // pool2
-        add_config(TensorShape(208U, 208U, 64U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)));
-        // pool5
-        add_config(TensorShape(104U, 104U, 128U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)));
-        // pool8
-        add_config(TensorShape(52U, 52U, 256U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)));
-        // pool13
-        add_config(TensorShape(26U, 26U, 512U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)));
+        add_config(TensorShape(4U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 4U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U)); // 6D tensor
     }
 };
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_YOLOV2_POOLING_LAYER_DATASET */
+
+#endif /* ACL_TESTS_DATASETS_LARGEMATMULMMULDATASET */
diff --git a/tests/datasets/MatMulDataset.h b/tests/datasets/MatMulDataset.h
new file mode 100644
index 0000000000..9c1c5fb05d
--- /dev/null
+++ b/tests/datasets/MatMulDataset.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_MATMULDATASET
+#define ACL_TESTS_DATASETS_MATMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class MatMulDataset // Base class for MatMul shape datasets: three parallel lists holding the A, B and Out shape of each config
+{
+public:
+    using type = std::tuple<TensorShape, TensorShape, TensorShape>; // (A shape, B shape, Out shape)
+
+    struct iterator // Walks the three shape lists in lockstep; note there is no end() or comparison operator in this class
+    {
+        iterator(std::vector<TensorShape>::const_iterator a_it,
+                 std::vector<TensorShape>::const_iterator b_it,
+                 std::vector<TensorShape>::const_iterator dst_it)
+            : _a_it{ std::move(a_it) },
+              _b_it{ std::move(b_it) },
+              _dst_it{ std::move(dst_it) }
+        {
+        }
+
+        std::string description() const // Label for the current config: "A=<a>:B=<b>:Out=<dst>:" (trailing ':' included)
+        {
+            std::stringstream description;
+            description << "A=" << *_a_it << ":";
+            description << "B=" << *_b_it << ":";
+            description << "Out=" << *_dst_it << ":";
+            return description.str();
+        }
+
+        MatMulDataset::type operator*() const // Current config as a tuple holding copies of the three shapes
+        {
+            return std::make_tuple(*_a_it, *_b_it, *_dst_it);
+        }
+
+        iterator &operator++() // Pre-increment: advances all three underlying iterators together
+        {
+            ++_a_it;
+            ++_b_it;
+            ++_dst_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _a_it;
+        std::vector<TensorShape>::const_iterator _b_it;
+        std::vector<TensorShape>::const_iterator _dst_it;
+    };
+
+    iterator begin() const // Iterator positioned on the first config
+    {
+        return iterator(_a_shapes.begin(), _b_shapes.begin(), _dst_shapes.begin());
+    }
+
+    int size() const // Number of complete configs: length of the shortest list (add_config grows all three together)
+    {
+        return std::min(_a_shapes.size(), std::min(_b_shapes.size(), _dst_shapes.size()));
+    }
+
+    void add_config(TensorShape a, TensorShape b, TensorShape dst) // Appends one (A, B, Out) config to the three lists
+    {
+        _a_shapes.emplace_back(std::move(a));
+        _b_shapes.emplace_back(std::move(b));
+        _dst_shapes.emplace_back(std::move(dst));
+    }
+
+protected: // Constructors are protected: only derived dataset classes can create a MatMulDataset
+    MatMulDataset()                 = default;
+    MatMulDataset(MatMulDataset &&) = default;
+
+private:
+    std::vector<TensorShape> _a_shapes{};   // A shape of each config
+    std::vector<TensorShape> _b_shapes{};   // B shape of each config
+    std::vector<TensorShape> _dst_shapes{}; // Out shape of each config
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ACL_TESTS_DATASETS_MATMULDATASET */
diff --git a/tests/datasets/MatMulLowpMMULDataset.h b/tests/datasets/MatMulLowpMMULDataset.h
new file mode 100644
index 0000000000..1b22e1061f
--- /dev/null
+++ b/tests/datasets/MatMulLowpMMULDataset.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H
+#define ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/MatMulDataset.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+/** MatMulLowp MMUL shapes are similar to MatMul MMUL shapes, except that K (first dimension of A, second of B) has to be a
+ * multiple of MMUL_K0, which is 16 here (see src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp for the definition).
+ */
+class SmallMatMulLowpMMULDataset final : public MatMulDataset
+{
+public:
+    SmallMatMulLowpMMULDataset()
+    {
+        add_config(TensorShape(16U, 4U), TensorShape(4U, 16U), TensorShape(4U, 4U)); // same as mmul block
+        add_config(TensorShape(96U, 1U), TensorShape(1U, 96U), TensorShape(1U, 1U)); // vector x vector
+        add_config(TensorShape(32U, 4U, 2U), TensorShape(16U, 32U, 2U), TensorShape(16U, 4U, 2U)); // 3D shapes, K = 32
+        add_config(TensorShape(48U, 2U), TensorShape(17U, 48U), TensorShape(17U, 2U)); // K = 48, Out is 17x2
+        add_config(TensorShape(32U, 6U), TensorShape(7U, 32U), TensorShape(7U, 6U)); // K = 32, Out is 7x6
+    }
+};
+
+// Reduced dataset for cases that need fewer tests while still using small shapes,
+// e.g. to avoid repeating every shape for QASYMM8 when QASYMM8_SIGNED is already tested with the full set.
+class SmallMatMulLowpMMULDatasetSubset final : public MatMulDataset
+{
+public:
+    SmallMatMulLowpMMULDatasetSubset()
+    {
+        add_config(TensorShape(32U, 4U, 2U), TensorShape(16U, 32U, 2U), TensorShape(16U, 4U, 2U)); // also present in SmallMatMulLowpMMULDataset
+        add_config(TensorShape(32U, 6U), TensorShape(7U, 32U), TensorShape(7U, 6U)); // also present in SmallMatMulLowpMMULDataset
+    }
+};
+
+class SmallMatMulLowpMMULWithBiasDataset final : public MatMulDataset // Only A/B/Out shapes are stored; no bias shape is part of a config
+{
+public:
+    SmallMatMulLowpMMULWithBiasDataset()
+    {
+        add_config(TensorShape(32U, 4U, 2U, 2U), TensorShape(16U, 32U, 2U, 2U), TensorShape(16U, 4U, 2U, 2U)); // single 4D config, K = 32
+    }
+};
+
+class LargeMatMulLowpMMULDataset final : public MatMulDataset
+{
+public:
+    LargeMatMulLowpMMULDataset()
+    {
+        add_config(TensorShape(192U, 38U, 3U, 2U), TensorShape(21U, 192U, 3U, 2U), TensorShape(21U, 38U, 3U, 2U)); // single 4D config, K = 192 (12 * 16)
+    }
+};
+
+class HighDimensionalMatMulLowpMMULDataset final : public MatMulDataset
+{
+public:
+    HighDimensionalMatMulLowpMMULDataset()
+    {
+        add_config(TensorShape(16U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 16U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U)); // 6D tensors, K = 16
+    }
+};
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H
diff --git a/tests/datasets/Pooling3dLayerDataset.h b/tests/datasets/Pooling3dLayerDataset.h
new file mode 100644
index 0000000000..cfe970e8be
--- /dev/null
+++ b/tests/datasets/Pooling3dLayerDataset.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_POOLING_3D_LAYER_DATASET
+#define ARM_COMPUTE_TEST_POOLING_3D_LAYER_DATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class Pooling3dLayerDataset // Base class for 3D pooling datasets: two parallel lists of source shapes and Pooling3dLayerInfo values
+{
+public:
+    using type = std::tuple<TensorShape, Pooling3dLayerInfo>; // (source shape, pooling info)
+
+    struct iterator // Walks both lists in lockstep; note there is no end() or comparison operator in this class
+    {
+        iterator(std::vector<TensorShape>::const_iterator        src_it,
+                 std::vector<Pooling3dLayerInfo>::const_iterator infos_it)
+            : _src_it{ std::move(src_it) },
+              _infos_it{ std::move(infos_it) }
+        {
+        }
+
+        std::string description() const // Label for the current config: "In=<shape>:Info=<info>:" (trailing ':' included)
+        {
+            std::stringstream description;
+            description << "In=" << *_src_it << ":";
+            description << "Info=" << *_infos_it << ":";
+            return description.str();
+        }
+
+        Pooling3dLayerDataset::type operator*() const // Current config as a tuple holding copies of shape and info
+        {
+            return std::make_tuple(*_src_it, *_infos_it);
+        }
+
+        iterator &operator++() // Pre-increment: advances both underlying iterators together
+        {
+            ++_src_it;
+            ++_infos_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator        _src_it;
+        std::vector<Pooling3dLayerInfo>::const_iterator _infos_it;
+    };
+
+    iterator begin() const // Iterator positioned on the first config
+    {
+        return iterator(_src_shapes.begin(), _infos.begin());
+    }
+
+    int size() const // Number of complete configs: length of the shorter list (add_config grows both together)
+    {
+        return std::min(_src_shapes.size(), _infos.size());
+    }
+
+    void add_config(TensorShape src, Pooling3dLayerInfo info) // Appends one (source shape, pooling info) config
+    {
+        _src_shapes.emplace_back(std::move(src));
+        _infos.emplace_back(std::move(info));
+    }
+
+protected: // Constructors are protected: only derived dataset classes can create a Pooling3dLayerDataset
+    Pooling3dLayerDataset()                         = default;
+    Pooling3dLayerDataset(Pooling3dLayerDataset &&) = default;
+
+private:
+    std::vector<TensorShape>        _src_shapes{}; // Source shape of each config
+    std::vector<Pooling3dLayerInfo> _infos{};      // Pooling info of each config
+};
+
+// Special-case 3D pooling configurations (each entry: source shape + Pooling3dLayerInfo)
+class Pooling3dLayerDatasetSpecial final : public Pooling3dLayerDataset
+{
+public:
+    Pooling3dLayerDatasetSpecial()
+    {
+        // Special cases; the second entry uses a pool size (100) larger than every dimension of its source shape
+        add_config(TensorShape(2U, 3U, 4U, 2U, 4U), Pooling3dLayerInfo(PoolingType::AVG, /*pool size*/ Size3D(2, 2, 1), /*pool strides*/ Size3D(3, 3, 1), /*pool padding*/ Padding3D(0, 0, 0), true));
+        add_config(TensorShape(20U, 22U, 10U, 2U), Pooling3dLayerInfo(PoolingType::AVG, /*pool size*/ Size3D(100, 100, 100), /*pool strides*/ Size3D(5, 5, 5), Padding3D(50, 50, 50), true));
+        add_config(TensorShape(10U, 20U, 32U, 3U, 2U), Pooling3dLayerInfo(PoolingType::MAX, /*pool size*/ 3, /*pool strides*/ Size3D(2, 2, 2), Padding3D(1, 1, 1, 1, 1, 1), false, false,
+                                                                    DimensionRoundingType::FLOOR));
+        add_config(TensorShape(14U, 10U, 10U, 3U, 5U), Pooling3dLayerInfo(PoolingType::AVG, /*pool size*/ Size3D(3, 3, 3), /*pool strides*/ Size3D(3, 3, 3), Padding3D(2, 1, 2), true, false, DimensionRoundingType::CEIL));
+        add_config(TensorShape(14U, 10U, 10U, 2U, 4U), Pooling3dLayerInfo(PoolingType::AVG, /*pool size*/ Size3D(3, 3, 3), /*pool strides*/ Size3D(3, 3, 3), Padding3D(2, 1, 2), false, false, DimensionRoundingType::CEIL));
+        add_config(TensorShape(15U, 13U, 13U, 3U, 5U), Pooling3dLayerInfo(PoolingType::AVG, /*pool size*/ Size3D(4, 4, 4), /*pool strides*/ Size3D(2, 2, 2), Padding3D(2, 2, 2), true, false, DimensionRoundingType::CEIL));
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_POOLING_3D_LAYER_DATASET */
diff --git a/tests/datasets/PoolingLayerDataset.h b/tests/datasets/PoolingLayerDataset.h
index 01b2491eb2..1557240fd2 100644
--- a/tests/datasets/PoolingLayerDataset.h
+++ b/tests/datasets/PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -106,7 +106,7 @@ public:
     PoolingLayerDatasetSpecial()
     {
         // Special cases
-        add_config(TensorShape(2U, 3U, 4U, 1U), PoolingLayerInfo(PoolingType::AVG, Size2D(3, 3), DataLayout::NCHW, PadStrideInfo(3, 3, 0, 0), true));
+        add_config(TensorShape(2U, 3U, 4U, 1U), PoolingLayerInfo(PoolingType::AVG, Size2D(2, 2), DataLayout::NCHW, PadStrideInfo(3, 3, 0, 0), true));
         add_config(TensorShape(60U, 52U, 3U, 2U), PoolingLayerInfo(PoolingType::AVG, Size2D(100, 100), DataLayout::NCHW, PadStrideInfo(5, 5, 50, 50), true));
         // Asymmetric padding
         add_config(TensorShape(112U, 112U, 32U), PoolingLayerInfo(PoolingType::MAX, 3, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)));
diff --git a/tests/datasets/RandomBatchNormalizationLayerDataset.h b/tests/datasets/RandomBatchNormalizationLayerDataset.h
index 5a49dd702b..4ccb2eaaba 100644
--- a/tests/datasets/RandomBatchNormalizationLayerDataset.h
+++ b/tests/datasets/RandomBatchNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,9 +42,9 @@ class SmallRandomBatchNormalizationLayerDataset final : public BatchNormalizatio
 public:
     SmallRandomBatchNormalizationLayerDataset()
     {
-        add_config(TensorShape(15U, 16U, 2U, 12U), TensorShape(2U), 0.1f);
+        add_config(TensorShape(1U, 16U, 2U, 12U), TensorShape(2U), 0.1f);
         add_config(TensorShape(21U, 11U, 12U, 7U), TensorShape(12U), 0.1f);
-        add_config(TensorShape(7U, 3U, 6U, 11U), TensorShape(6U), 0.1f);
+        add_config(TensorShape(32U, 3U, 6U, 11U), TensorShape(6U), 0.1f);
     }
 };
 class LargeRandomBatchNormalizationLayerDataset final : public BatchNormalizationLayerDataset
diff --git a/tests/datasets/ReorderLayerDataset.h b/tests/datasets/ReorderLayerDataset.h
new file mode 100644
index 0000000000..8e1a8422b2
--- /dev/null
+++ b/tests/datasets/ReorderLayerDataset.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_REORDERLAYERDATASET
+#define ACL_TESTS_DATASETS_REORDERLAYERDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+/** [ReorderLayer datasets] **/
+class ReorderLayerDataset // Base class for reorder datasets: four parallel lists (input shape, output shape, input and output weight format)
+{
+public:
+    using type = std::tuple<TensorShape, TensorShape, WeightFormat, WeightFormat>; // (In, Out, Wf_In, Wf_Out)
+
+    struct iterator // Walks the four lists in lockstep; note there is no end() or comparison operator in this class
+    {
+        iterator(std::vector<TensorShape>::const_iterator  in_it,
+                 std::vector<TensorShape>::const_iterator  out_it,
+                 std::vector<WeightFormat>::const_iterator wf_in_it,
+                 std::vector<WeightFormat>::const_iterator wf_out_it)
+            : _in_it{ std::move(in_it) },
+              _out_it{ std::move(out_it) },
+              _wf_in_it{ std::move(wf_in_it) },
+              _wf_out_it{ std::move(wf_out_it) }
+        {
+        }
+
+        std::string description() const // Label for the current config: "In=..:Out=..:Wf_In=..:Wf_Out=.." (no trailing ':')
+        {
+            std::stringstream description;
+            description << "In=" << *_in_it << ":";
+            description << "Out=" << *_out_it << ":";
+            description << "Wf_In=" << *_wf_in_it << ":";
+            description << "Wf_Out=" << *_wf_out_it;
+            return description.str();
+        }
+
+        ReorderLayerDataset::type operator*() const // Current config as a tuple holding copies of the four values
+        {
+            return std::make_tuple(*_in_it, *_out_it, *_wf_in_it, *_wf_out_it);
+        }
+
+        iterator &operator++() // Pre-increment: advances all four underlying iterators together
+        {
+            ++_in_it;
+            ++_out_it;
+            ++_wf_in_it;
+            ++_wf_out_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator  _in_it;
+        std::vector<TensorShape>::const_iterator  _out_it;
+        std::vector<WeightFormat>::const_iterator _wf_in_it;
+        std::vector<WeightFormat>::const_iterator _wf_out_it;
+    };
+
+    iterator begin() const // Iterator positioned on the first config
+    {
+        return iterator(_in_shapes.begin(), _out_shapes.begin(), _in_wfs.begin(), _out_wfs.begin());
+    }
+
+    int size() const // Number of complete configs: length of the shortest list (add_config grows all four together)
+    {
+        return std::min(_in_shapes.size(), std::min(_out_shapes.size(), std::min(_in_wfs.size(), _out_wfs.size())));
+    }
+
+    void add_config(TensorShape in, TensorShape out, WeightFormat in_wf, WeightFormat out_wf) // Appends one config to the four lists
+    {
+        _in_shapes.emplace_back(std::move(in));
+        _out_shapes.emplace_back(std::move(out));
+        _in_wfs.emplace_back(in_wf);
+        _out_wfs.emplace_back(out_wf);
+    }
+
+    // NOTE(review): "protected:" was commented out here, so these constructors are public (MatMulDataset and Pooling3dLayerDataset keep theirs protected) - confirm this is intended
+    ReorderLayerDataset()                       = default;
+    ReorderLayerDataset(ReorderLayerDataset &&) = default;
+
+private:
+    std::vector<TensorShape>  _in_shapes{};  // Input shape of each config
+    std::vector<TensorShape>  _out_shapes{}; // Output shape of each config
+    std::vector<WeightFormat> _in_wfs{};     // Input weight format of each config
+    std::vector<WeightFormat> _out_wfs{};    // Output weight format of each config
+};
+
+/** [ReorderLayer datasets] **/
+
+class ReorderLayerDatasetBlock4 final : public ReorderLayerDataset // OHWI -> OHWIo4; in every config Out's second dimension is In's rounded up to a multiple of 4
+{
+public:
+    ReorderLayerDatasetBlock4()
+    {
+        add_config(TensorShape(10U, 9U), TensorShape(10U, 12U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(16U, 16U), TensorShape(16U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(10U, 511U), TensorShape(10U, 512U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(234U, 301U), TensorShape(234U, 304U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(1024U, 1024U), TensorShape(1024U, 1024U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(10U, 9U, 1U, 1U), TensorShape(10U, 12U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4); // from here on: the same five cases as 4D shapes with trailing 1s
+        add_config(TensorShape(16U, 16U, 1U, 1U), TensorShape(16U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(10U, 511U, 1U, 1U), TensorShape(10U, 512U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(234U, 301U, 1U, 1U), TensorShape(234U, 304U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+        add_config(TensorShape(1024U, 1024U, 1U, 1U), TensorShape(1024U, 1024U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo4);
+    }
+};
+
+class ReorderLayerDatasetBlock8 final : public ReorderLayerDataset // OHWI -> OHWIo8; in every config Out's second dimension is In's rounded up to a multiple of 8
+{
+public:
+    ReorderLayerDatasetBlock8()
+    {
+        add_config(TensorShape(10U, 9U), TensorShape(10U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(16U, 16U), TensorShape(16U, 16U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(10U, 511U), TensorShape(10U, 512U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(234U, 301U), TensorShape(234U, 304U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(1024U, 1024U), TensorShape(1024U, 1024U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(10U, 9U, 1U, 1U), TensorShape(10U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8); // from here on: the same five cases as 4D shapes with trailing 1s
+        add_config(TensorShape(16U, 16U, 1U, 1U), TensorShape(16U, 16U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(10U, 511U, 1U, 1U), TensorShape(10U, 512U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(234U, 301U, 1U, 1U), TensorShape(234U, 304U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+        add_config(TensorShape(1024U, 1024U, 1U, 1U), TensorShape(1024U, 1024U, 1U, 1U), WeightFormat::OHWI, WeightFormat::OHWIo8);
+    }
+};
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ACL_TESTS_DATASETS_REORDERLAYERDATASET */
diff --git a/tests/datasets/ReshapeLayerDataset.h b/tests/datasets/ReshapeLayerDataset.h
index d1a1667683..015f9157aa 100644
--- a/tests/datasets/ReshapeLayerDataset.h
+++ b/tests/datasets/ReshapeLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_RESHAPE_LAYER_DATASET
-#define ARM_COMPUTE_TEST_RESHAPE_LAYER_DATASET
+#ifndef ACL_TESTS_DATASETS_RESHAPELAYERDATASET_H
+#define ACL_TESTS_DATASETS_RESHAPELAYERDATASET_H
 
 #include "utils/TypePrinter.h"
 
@@ -111,9 +111,10 @@ public:
         add_config(TensorShape(17U, 3U, 12U), TensorShape(1U, 1U, 612U));
         add_config(TensorShape(26U, 26U, 32U), TensorShape(13U, 13U, 128U));
         add_config(TensorShape(31U, 23U, 4U, 7U), TensorShape(2U, 14U, 713U));
+        add_config(TensorShape(8U, 8U, 8U), TensorShape(8U, 64U));
     }
 };
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_RESHAPE_LAYER_DATASET */
+#endif // ACL_TESTS_DATASETS_RESHAPELAYERDATASET_H
diff --git a/tests/datasets/ScaleValidationDataset.h b/tests/datasets/ScaleValidationDataset.h
index 25112f155f..8987c3a1c1 100644
--- a/tests/datasets/ScaleValidationDataset.h
+++ b/tests/datasets/ScaleValidationDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,15 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_SCALE_VALIDATION_DATASET
-#define ARM_COMPUTE_TEST_SCALE_VALIDATION_DATASET
+#ifndef TESTS_DATASETS_SCALEVALIDATIONDATASET
+#define TESTS_DATASETS_SCALEVALIDATIONDATASET
 
-#include "utils/TypePrinter.h"
-
-#include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/InterpolationPolicyDataset.h"
 #include "tests/datasets/SamplingPolicyDataset.h"
 #include "tests/datasets/ShapeDatasets.h"
 
@@ -140,18 +136,16 @@ const auto ScaleAlignCornersSamplingPolicySet = combine(framework::dataset::make
 }),
 framework::dataset::make("AlignCorners", { true }));
 
-/** Generated shapes: Used by Neon precommit and nightly
+/** Generated shapes: used by precommit and nightly for CPU tests
  * - 2D shapes with 0, 1, 2 vector iterations
  * - 3D shapes with 0, 1 vector iterations
  * - 4D shapes with 0 vector iterations
  */
-#define SCALE_SHAPE_DATASET(element_per_iteration)                                                  \
-    concat(concat(concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(),  \
-                                       ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
-                                ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()),        \
-                         ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 0>()),               \
-                  ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 1>()),                      \
-           ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 0>())
+#define SCALE_SHAPE_DATASET(element_per_iteration)                                    \
+    concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(),  \
+                         ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()), \
+                  ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 1>()),        \
+           ScaleShapesBaseDataSet<40, 3, (element_per_iteration), 0>())
 
 // To prevent long precommit time for OpenCL, shape set for OpenCL is separated into below two parts.
 /** Generated shapes for precommits to achieve essential coverage. Used by CL precommit and nightly
@@ -166,20 +160,36 @@ framework::dataset::make("AlignCorners", { true }));
  * - 3D shapes with 0 vector iterations (1 vector iteration is covered by SCALE_PRECOMMIT_SHAPE_DATASET)
  * - 4D shapes with 0 vector iterations
  */
-#define SCALE_NIGHTLY_SHAPE_DATASET(element_per_iteration)                                   \
-    concat(concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(),  \
-                                ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
-                         ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()),        \
-                  ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 0>()),               \
+#define SCALE_NIGHTLY_SHAPE_DATASET(element_per_iteration)                            \
+    concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(),  \
+                         ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
+                  ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 0>()),        \
            ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 0>())
 
-/** Generating dataset for non-quantized data tyeps with the given shapes */
+/** Generating dataset for non-quantized data types with the given shapes */
 #define ASSEMBLE_DATASET(shape, samping_policy_set)             \
     combine(combine(combine(combine((shape), ScaleDataLayouts), \
                             ScaleInterpolationPolicySet),       \
                     datasets::BorderModes()),                   \
             samping_policy_set)
 
+#define ASSEMBLE_DATASET_DYNAMIC_FUSION(shape, sampling_policy_set)                                 \
+    combine(combine(combine((shape), framework::dataset::make("DataLayout", { DataLayout::NHWC })), \
+                    ScaleInterpolationPolicySet),                                                   \
+            sampling_policy_set)
+
+#define ASSEMBLE_S8_DATASET(shape, sampling_policy_set)                                                          \
+    combine(combine(combine(combine((shape), framework::dataset::make("DataLayout", DataLayout::NHWC)),          \
+                            framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::BILINEAR })), \
+                    framework::dataset::make("BorderMode", { BorderMode::REPLICATE })),                          \
+            sampling_policy_set)
+
+#define ASSEMBLE_NHWC_DATASET(shape, sampling_policy_set)                                                     \
+    combine(combine(combine(combine((shape), framework::dataset::make("DataLayout", DataLayout::NHWC)),       \
+                            ScaleInterpolationPolicySet),                                                     \
+                    framework::dataset::make("BorderMode", { BorderMode::CONSTANT, BorderMode::REPLICATE })), \
+            sampling_policy_set)
+
 /** Generating dataset for quantized data tyeps with the given shapes */
 #define ASSEMBLE_QUANTIZED_DATASET(shape, sampling_policy_set, quantization_info_set) \
     combine(combine(combine(combine(combine(shape,                                    \
@@ -189,7 +199,24 @@ framework::dataset::make("AlignCorners", { true }));
                     datasets::BorderModes()),                                         \
             sampling_policy_set)
 
+#define ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(shape, sampling_policy_set, quantization_info_set) \
+    combine(combine(combine(combine(shape,                                                           \
+                                    quantization_info_set),                                          \
+                            framework::dataset::make("DataLayout", { DataLayout::NHWC })),           \
+                    ScaleInterpolationPolicySet),                                                    \
+            sampling_policy_set)
+
+/** Generating dataset for quantized data types with separate input and output quantization info sets, with the given shapes */
+#define ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(shape, sampling_policy_set, input_quant_info_set, output_quant_info_set) \
+    combine(combine(combine(combine(combine(combine(shape,                                                              \
+                                                    input_quant_info_set),                                              \
+                                            output_quant_info_set),                                                     \
+                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })),                      \
+                            framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::BILINEAR })),        \
+                    framework::dataset::make("BorderMode", { BorderMode::REPLICATE })),                                 \
+            sampling_policy_set)
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SCALE_VALIDATION_DATASET */
+#endif /* TESTS_DATASETS_SCALEVALIDATIONDATASET */
diff --git a/tests/datasets/ScatterDataset.h b/tests/datasets/ScatterDataset.h
new file mode 100644
index 0000000000..8fd4448d2d
--- /dev/null
+++ b/tests/datasets/ScatterDataset.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_SCATTERDATASET_H
+#define ACL_TESTS_DATASETS_SCATTERDATASET_H
+
+#include "arm_compute/core/TensorShape.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+
+/** Collection of scatter configurations kept as four parallel lists of tensor shapes. */
+class ScatterDataset
+{
+public:
+    /** One configuration: the (src, updates, indices, dst) shapes. */
+    using type = std::tuple<TensorShape, TensorShape, TensorShape, TensorShape>;
+
+    /** Cursor that walks the four shape lists in lock-step. */
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator src,
+                 std::vector<TensorShape>::const_iterator updates,
+                 std::vector<TensorShape>::const_iterator indices,
+                 std::vector<TensorShape>::const_iterator dst)
+            : _src_it{ std::move(src) }, _updates_it{ std::move(updates) }, _indices_it{ std::move(indices) }, _dst_it{ std::move(dst) }
+        {
+        }
+
+        /** Text label of the current configuration: "A=<src>:B=<updates>:C=<indices>:Out=<dst>:". */
+        std::string description() const
+        {
+            std::stringstream label;
+            label << "A=" << *_src_it << ":" << "B=" << *_updates_it << ":";
+            label << "C=" << *_indices_it << ":" << "Out=" << *_dst_it << ":";
+            return label.str();
+        }
+
+        ScatterDataset::type operator*() const
+        {
+            return std::make_tuple(*_src_it, *_updates_it, *_indices_it, *_dst_it);
+        }
+
+        iterator &operator++()
+        {
+            ++_dst_it;
+            ++_indices_it;
+            ++_updates_it;
+            ++_src_it;
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _src_it;
+        std::vector<TensorShape>::const_iterator _updates_it;
+        std::vector<TensorShape>::const_iterator _indices_it;
+        std::vector<TensorShape>::const_iterator _dst_it;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_src_shapes.begin(), _update_shapes.begin(), _indices_shapes.begin(), _dst_shapes.begin());
+    }
+
+    /** Number of configurations, taken as the length of the shortest shape list. */
+    int size() const
+    {
+        return std::min({ _src_shapes.size(), _update_shapes.size(), _indices_shapes.size(), _dst_shapes.size() });
+    }
+
+    /** Append one configuration; a, b, c and dst are stored as the src, updates, indices and dst shapes. */
+    void add_config(TensorShape a, TensorShape b, TensorShape c, TensorShape dst)
+    {
+        _src_shapes.emplace_back(std::move(a));
+        _update_shapes.emplace_back(std::move(b));
+        _indices_shapes.emplace_back(std::move(c));
+        _dst_shapes.emplace_back(std::move(dst));
+    }
+
+protected:
+    ScatterDataset()                  = default;
+    ScatterDataset(ScatterDataset &&) = default;
+
+private:
+    std::vector<TensorShape> _src_shapes{};
+    std::vector<TensorShape> _update_shapes{};
+    std::vector<TensorShape> _indices_shapes{};
+    std::vector<TensorShape> _dst_shapes{};
+};
+
+
+// 1D dataset for simple scatter tests. Each config is (src, updates, indices, dst).
+class Small1DScatterDataset final : public ScatterDataset
+{
+public:
+    Small1DScatterDataset()
+    {
+        add_config(TensorShape(6U), TensorShape(6U), TensorShape(1U, 6U), TensorShape(6U));
+        add_config(TensorShape(10U), TensorShape(2U), TensorShape(1U, 2U), TensorShape(10U));
+    }
+};
+
+// This dataset represents the case where both updates and dst are (m+1)-D.
+class SmallScatterMultiDimDataset final : public ScatterDataset
+{
+public:
+    SmallScatterMultiDimDataset()
+    {
+        // NOTE: Each config is (src, updates, indices, output).
+        //      - In these configs the dim replaced is the last (highest-index) one; only that dim of updates differs from src.
+        //      - The last "updates" dim should match the y-dim of indices.
+        //      - src/updates/dst should all have the same number of dims. Indices should be 2D.
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 2U), TensorShape(1U, 2U), TensorShape(6U, 5U));
+        add_config(TensorShape(9U, 3U, 4U), TensorShape(9U, 3U, 2U), TensorShape(1U, 2U), TensorShape(9U, 3U, 4U));
+        add_config(TensorShape(17U, 3U, 2U, 4U), TensorShape(17U, 3U, 2U, 7U), TensorShape(1U, 7U), TensorShape(17U, 3U, 2U, 4U));
+    }
+};
+
+// This dataset represents the (m+1)-D updates tensor, (m+n)-D output tensor cases, where each index has more than one element.
+class SmallScatterMultiIndicesDataset final : public ScatterDataset
+{
+public:
+    SmallScatterMultiIndicesDataset()
+    {
+        // NOTE: Each config is (src, updates, indices, output).
+        // NOTE: indices.shape.x = src.num_dimensions - updates.num_dimensions + 1
+
+        // Index length (indices.shape.x) is 2
+        add_config(TensorShape(6U, 5U, 2U), TensorShape(6U, 4U), TensorShape(2U, 4U), TensorShape(6U, 5U, 2U));
+        add_config(TensorShape(17U, 3U, 3U, 2U), TensorShape(17U, 3U, 2U), TensorShape(2U, 2U), TensorShape(17U, 3U, 3U, 2U));
+        add_config(TensorShape(11U, 3U, 3U, 2U, 4U), TensorShape(11U, 3U, 3U, 4U), TensorShape(2U, 4U), TensorShape(11U, 3U, 3U, 2U, 4U));
+        add_config(TensorShape(5U, 4U, 3U, 3U, 2U, 4U), TensorShape(5U, 4U, 3U, 3U, 5U), TensorShape(2U, 5U), TensorShape(5U, 4U, 3U, 3U, 2U, 4U));
+
+        // Index length (indices.shape.x) is 3
+        add_config(TensorShape(4U, 3U, 2U, 2U), TensorShape(4U, 2U), TensorShape(3U, 2U), TensorShape(4U, 3U, 2U, 2U));
+        add_config(TensorShape(17U, 4U, 3U, 2U, 2U), TensorShape(17U, 4U, 4U), TensorShape(3U, 4U), TensorShape(17U, 4U, 3U, 2U, 2U));
+        add_config(TensorShape(10U, 4U, 5U, 3U, 2U, 2U), TensorShape(10U, 4U, 5U, 3U), TensorShape(3U, 3U), TensorShape(10U, 4U, 5U, 3U, 2U, 2U));
+
+        // Index length (indices.shape.x) is 4
+        add_config(TensorShape(35U, 4U, 3U, 2U, 2U), TensorShape(35U, 4U), TensorShape(4U, 4U), TensorShape(35U, 4U, 3U, 2U, 2U));
+        add_config(TensorShape(10U, 4U, 5U, 3U, 2U, 2U), TensorShape(10U, 4U, 3U), TensorShape(4U, 3U), TensorShape(10U, 4U, 5U, 3U, 2U, 2U));
+
+        // Index length (indices.shape.x) is 5
+        add_config(TensorShape(10U, 4U, 5U, 3U, 2U, 2U), TensorShape(10U, 3U), TensorShape(5U, 3U), TensorShape(10U, 4U, 5U, 3U, 2U, 2U));
+    }
+};
+
+// This dataset represents the (m+k)-D updates tensor, (k+1)-D indices tensor and (m+n)-D output tensor cases
+class SmallScatterBatchedDataset final : public ScatterDataset
+{
+public:
+    SmallScatterBatchedDataset()
+    {
+        // NOTE: Each config is (src, updates, indices, output).
+        // NOTE: Updates/Indices tensors are now batched.
+        // NOTE: indices.shape.x = (src.num_dimensions - updates.num_dimensions) + k in every config below (the unbatched case is k = 1)
+        // k is the number of batch dimensions, i.e. the indices tensor is (k+1)-D
+        // k = 2
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 2U, 2U), TensorShape(1U, 2U, 2U), TensorShape(6U, 5U));
+        add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 5U, 6U, 2U), TensorShape(3U, 6U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+
+        // k = 3
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 2U, 2U, 2U), TensorShape(1U, 2U, 2U, 2U), TensorShape(6U, 5U));
+        add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 5U, 3U, 6U, 2U), TensorShape(3U, 3U, 6U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+
+        // k = 4
+        add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 6U, 2U, 3U, 2U), TensorShape(4U, 6U, 2U, 3U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+
+        // k = 5
+        add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 3U, 4U, 3U, 2U, 2U), TensorShape(4U, 3U, 4U, 3U, 2U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+    }
+};
+
+class SmallScatterScalarDataset final : public ScatterDataset
+{
+public:
+    // Scalar case (optionally batched): in every config indices.shape.x equals src.num_dimensions. Config is (src, updates, indices, output).
+    SmallScatterScalarDataset()
+    {
+        add_config(TensorShape(6U, 5U), TensorShape(6U), TensorShape(2U, 6U), TensorShape(6U, 5U));
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 6U), TensorShape(2U, 6U, 6U), TensorShape(6U, 5U));
+        add_config(TensorShape(3U, 3U, 6U, 5U), TensorShape(6U, 6U), TensorShape(4U, 6U, 6U), TensorShape(3U, 3U, 6U, 5U));
+    }
+};
+
+// This dataset is for data types that do not require full testing. It mainly contains configs selected from the datasets above.
+class SmallScatterMixedDataset final : public ScatterDataset
+{
+public:
+    SmallScatterMixedDataset()
+    {
+        add_config(TensorShape(10U), TensorShape(2U), TensorShape(1U, 2U), TensorShape(10U));
+        add_config(TensorShape(9U, 3U, 4U), TensorShape(9U, 3U, 2U), TensorShape(1U, 2U), TensorShape(9U, 3U, 4U));
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 6U), TensorShape(2U, 6U, 6U), TensorShape(6U, 5U));
+        add_config(TensorShape(35U, 4U, 3U, 2U, 2U), TensorShape(35U, 4U), TensorShape(4U, 4U), TensorShape(35U, 4U, 3U, 2U, 2U));
+        add_config(TensorShape(11U, 3U, 3U, 2U, 4U), TensorShape(11U, 3U, 3U, 4U), TensorShape(2U, 4U), TensorShape(11U, 3U, 3U, 2U, 4U));
+        add_config(TensorShape(6U, 5U, 2U), TensorShape(6U, 2U, 2U), TensorShape(2U, 2U, 2U), TensorShape(6U, 5U, 2U));
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_DATASETS_SCATTERDATASET_H
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index a7f1a44286..c1e61444a8 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -88,9 +88,9 @@ public:
     Small2DShapes()
         : ShapeDataset("Shape",
     {
-        TensorShape{ 7U, 7U },
-                     TensorShape{ 27U, 13U },
-                     TensorShape{ 128U, 64U }
+        TensorShape{ 1U, 7U },
+                     TensorShape{ 5U, 13U },
+                     TensorShape{ 32U, 64U }
     })
     {
     }
@@ -135,7 +135,7 @@ public:
     Tiny4DShapes()
         : ShapeDataset("Shape",
     {
-        TensorShape{ 7U, 7U, 5U, 3U },
+        TensorShape{ 2U, 7U, 5U, 3U },
                      TensorShape{ 17U, 13U, 7U, 2U },
     })
     {
@@ -165,12 +165,32 @@ public:
         : ShapeDataset("Shape",
     {
         // Batch size 1
-        TensorShape{ 9U, 9U },
+        TensorShape{ 1U, 9U },
                      TensorShape{ 27U, 13U, 2U },
     })
     {
     }
 };
+/** Data set containing small tensor shapes with none of the dimensions equal to 1 (unit). */
+class SmallNoneUnitShapes final : public ShapeDataset
+{
+public:
+    SmallNoneUnitShapes()
+        : ShapeDataset("Shape",
+    {
+        // Batch size 1
+        TensorShape{ 13U, 11U },
+                     TensorShape{ 16U, 16U },
+                     TensorShape{ 24U, 26U, 5U },
+                     TensorShape{ 7U, 7U, 17U, 2U },
+                     // Batch size 4
+                     TensorShape{ 27U, 13U, 2U, 4U },
+                     // Arbitrary batch size
+                     TensorShape{ 8U, 7U, 5U, 5U }
+    })
+    {
+    }
+};
 /** Data set containing small tensor shapes. */
 class SmallShapes final : public ShapeDataset
 {
@@ -179,12 +199,12 @@ public:
         : ShapeDataset("Shape",
     {
         // Batch size 1
-        TensorShape{ 11U, 11U },
-                     TensorShape{ 16U, 16U },
+        TensorShape{ 3U, 11U },
+                     TensorShape{ 1U, 16U },
                      TensorShape{ 27U, 13U, 7U },
                      TensorShape{ 7U, 7U, 17U, 2U },
-                     // Batch size 4
-                     TensorShape{ 27U, 13U, 2U, 4U },
+                     // Batch size 4 and 2 SIMD iterations
+                     TensorShape{ 33U, 13U, 2U, 4U },
                      // Arbitrary batch size
                      TensorShape{ 11U, 11U, 3U, 5U }
     })
@@ -192,6 +212,25 @@ public:
     }
 };
 
+/** Data set containing small tensor shapes of at most three dimensions, i.e. without a batch dimension. */
+class SmallShapesNoBatches final : public ShapeDataset
+{
+public:
+    SmallShapesNoBatches()
+        : ShapeDataset("Shape",
+    {
+        // Batch size 1
+        TensorShape{ 3U, 11U },
+                     TensorShape{ 1U, 16U },
+                     TensorShape{ 27U, 13U, 7U },
+                     TensorShape{ 7U, 7U, 17U },
+                     TensorShape{ 33U, 13U, 2U },
+                     TensorShape{ 11U, 11U, 3U }
+    })
+    {
+    }
+};
+
 /** Data set containing pairs of tiny tensor shapes that are broadcast compatible. */
 class TinyShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
 {
@@ -211,6 +250,25 @@ public:
     {
     }
 };
+/** Data set containing pairs of tiny tensor shapes that are broadcast compatible and can do in_place calculation. */
+class TinyShapesBroadcastInplace final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
+{
+public:
+    TinyShapesBroadcastInplace()
+        : ZipDataset<ShapeDataset, ShapeDataset>(
+              ShapeDataset("Shape0",
+    {
+        TensorShape{ 9U },
+                     TensorShape{ 10U, 2U, 14U, 2U },
+    }),
+    ShapeDataset("Shape1",
+    {
+        TensorShape{ 9U, 1U, 9U },
+        TensorShape{ 10U },
+    }))
+    {
+    }
+};
 /** Data set containing pairs of small tensor shapes that are broadcast compatible. */
 class SmallShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
 {
@@ -243,6 +301,52 @@ public:
     }
 };
 
+class TemporaryLimitedSmallShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
+{
+public:
+    TemporaryLimitedSmallShapesBroadcast()
+        : ZipDataset<ShapeDataset, ShapeDataset>(
+              ShapeDataset("Shape0",
+    {
+        TensorShape{ 1U, 3U, 4U, 2U },  // LHS broadcast X
+        TensorShape{ 6U, 4U, 2U, 3U },  // RHS broadcast X
+        TensorShape{ 7U, 1U, 1U, 4U },  // LHS broadcast Y, Z
+        TensorShape{ 8U, 5U, 6U, 3U },  // RHS broadcast Y, Z
+        TensorShape{ 1U, 1U, 1U, 2U },  // LHS broadcast X, Y, Z
+        TensorShape{ 2U, 6U, 4U, 3U },  // RHS broadcast X, Y, Z
+    }),
+    ShapeDataset("Shape1",
+    {
+        TensorShape{ 5U, 3U, 4U, 2U },
+        TensorShape{ 1U, 4U, 2U, 3U },
+        TensorShape{ 7U, 2U, 3U, 4U },
+        TensorShape{ 8U, 1U, 1U, 3U },
+        TensorShape{ 4U, 7U, 3U, 2U },
+        TensorShape{ 1U, 1U, 1U, 3U },
+    }))
+    {
+    }
+};
+/** Data set containing two pairs of large tensor shapes in which Shape1 is broadcast against Shape0. */
+class TemporaryLimitedLargeShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
+{
+public:
+    TemporaryLimitedLargeShapesBroadcast()
+        : ZipDataset<ShapeDataset, ShapeDataset>(
+              ShapeDataset("Shape0",
+    {
+        TensorShape{ 127U, 25U, 5U },
+        TensorShape{ 485U, 40U, 10U }
+    }),
+    ShapeDataset("Shape1",
+    {
+        TensorShape{ 1U, 1U, 1U },   // Broadcast in X, Y, Z
+        TensorShape{ 485U, 1U, 1U }, // Broadcast in Y, Z
+    }))
+    {
+    }
+};
+
 /** Data set containing medium tensor shapes. */
 class MediumShapes final : public ShapeDataset
 {
@@ -320,6 +424,19 @@ public:
     }
 };
 
+/** Data set containing a large three-dimensional tensor shape, i.e. without a batch dimension. */
+class LargeShapesNoBatches final : public ShapeDataset
+{
+public:
+    LargeShapesNoBatches()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 582U, 131U, 2U },
+    })
+    {
+    }
+};
+
 /** Data set containing pairs of large tensor shapes that are broadcast compatible. */
 class LargeShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
 {
@@ -501,6 +618,21 @@ public:
     }
 };
 
+/** Data set containing small 5D tensor shapes. */
+class Small5dShapes final : public ShapeDataset
+{
+public:
+    Small5dShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 5U, 5U, 7U, 4U, 3U },
+                     TensorShape{ 5U, 5U, 4U, 13U, 2U },
+                     TensorShape{ 5U, 5U, 3U, 5U, 2U },
+    })
+    {
+    }
+};
+
 /** Data set containing large 5x5 tensor shapes. */
 class Large5x5Shapes final : public ShapeDataset
 {
@@ -514,6 +646,19 @@ public:
     }
 };
 
+/** Data set containing large 5D tensor shapes. */
+class Large5dShapes final : public ShapeDataset
+{
+public:
+    Large5dShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 30U, 40U, 30U, 32U, 3U }
+    })
+    {
+    }
+};
+
 /** Data set containing small 5x1 tensor shapes. */
 class Small5x1Shapes final : public ShapeDataset
 {
@@ -651,6 +796,7 @@ public:
     SmallDeconvolutionShapes()
         : ShapeDataset("InputShape",
     {
+        // Multiple Vector Loops for FP32
         TensorShape{ 5U, 4U, 3U, 2U },
                      TensorShape{ 5U, 5U, 3U },
                      TensorShape{ 11U, 13U, 4U, 3U }
@@ -659,6 +805,19 @@ public:
     }
 };
 
+class SmallDeconvolutionShapesWithLargerChannels final : public ShapeDataset
+{
+public:
+    SmallDeconvolutionShapesWithLargerChannels()
+        : ShapeDataset("InputShape",
+    {
+        // Multiple Vector Loops for all data types
+        TensorShape{ 5U, 5U, 35U }
+    })
+    {
+    }
+};
+
 /** Data set containing tiny tensor shapes for direct convolution. */
 class TinyDirectConvolutionShapes final : public ShapeDataset
 {
@@ -689,6 +848,23 @@ public:
     }
 };
 
+class SmallDirectConv3DShapes final : public ShapeDataset
+{
+public:
+    SmallDirectConv3DShapes()
+        : ShapeDataset("InputShape",
+    {
+        // Batch size 2
+        TensorShape{ 1U, 3U, 4U, 5U, 2U },
+                     // Batch size 3
+                     TensorShape{ 7U, 27U, 3U, 6U, 3U },
+                     // Batch size 1
+                     TensorShape{ 32U, 37U, 13U, 1U, 1U },
+    })
+    {
+    }
+};
+
 /** Data set containing small tensor shapes for direct convolution. */
 class SmallDirectConvolutionTensorShiftShapes final : public ShapeDataset
 {
@@ -839,7 +1015,7 @@ public:
     SoftmaxLayerSmallShapes()
         : ShapeDataset("Shape",
     {
-        TensorShape{ 9U, 9U },
+        TensorShape{ 1U, 9U },
                      TensorShape{ 256U, 10U },
                      TensorShape{ 353U, 8U },
                      TensorShape{ 781U, 5U },
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index 66640dd943..67eade1e64 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -181,7 +181,17 @@ public:
     }
 };
 
-// TODO (COMPMID-1749)
+class SmallConvolutionLayerPrePaddingDataset final : public ConvolutionLayerDataset
+{
+public:
+    SmallConvolutionLayerPrePaddingDataset()
+    {
+        // Output shapes are calculated taking the pre-padding layer into account as well; all shapes are given in NCHW
+        add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 5U, 2U, 19U), TensorShape(19U), TensorShape(17U, 16U, 19U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 5U, 7U, 16U), TensorShape(16U), TensorShape(12U, 13U, 16U), PadStrideInfo(3, 2, 2, 0));
+    }
+};
+
 class SmallConvolutionLayerReducedDataset final : public ConvolutionLayerDataset
 {
 public:
diff --git a/tests/datasets/SmallGEMMDataset.h b/tests/datasets/SmallGEMMDataset.h
index 7d2b42a0d6..99c7abbf64 100644
--- a/tests/datasets/SmallGEMMDataset.h
+++ b/tests/datasets/SmallGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_SMALL_GEMM_DATASET
-#define ARM_COMPUTE_TEST_SMALL_GEMM_DATASET
+#ifndef ACL_TESTS_DATASETS_SMALLGEMMDATASET_H
+#define ACL_TESTS_DATASETS_SMALLGEMMDATASET_H
 
 #include "tests/datasets/GEMMDataset.h"
 
@@ -50,6 +50,7 @@ public:
         add_config(TensorShape(32U, 1U), TensorShape(17U, 32U), TensorShape(17U, 1U), TensorShape(17U, 1U), 0.4f, 0.7f);
     }
 };
+
 class SmallGEMMOutput3DDataset final : public GEMMDataset
 {
 public:
@@ -77,7 +78,37 @@ public:
         add_config(TensorShape(16U, 16U, 5U, 3U), TensorShape(8U, 16U), TensorShape(8U), TensorShape(8U, 16U, 5U, 3U), 1.0f, 0.3f);
     }
 };
+
+class SmallBatchedMatMulDataset final : public GEMMDataset
+{
+public:
+    SmallBatchedMatMulDataset()
+    {
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U), TensorShape(2U, 3U), 1.0f, 0.0f);
+        add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U), TensorShape(7U, 15U), 1.0f, 0.0f);
+        add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U), TensorShape(36U, 17U), 1.0f, 0.0f);
+        add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U), TensorShape(5U, 4U, 3U), 1.0f, 0.0f);
+        add_config(TensorShape(15U, 7U, 36U), TensorShape(29U, 15U, 36U), TensorShape(29U), TensorShape(29U, 7U, 36U), 1.0f, 0.0f);
+        add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U), TensorShape(5U, 17U, 32U), 1.0f, 0.0f);
+        add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U), TensorShape(19U, 256U, 32U), 1.0f, 0.0f);
+        // Broadcast in RHS's batch dimension
+        add_config(TensorShape(15U, 7U, 36U), TensorShape(29U, 15U), TensorShape(29U), TensorShape(29U, 7U, 36U), 1.0f, 0.0f);
+        add_config(TensorShape(15U, 7U, 36U, 2U), TensorShape(29U, 15U), TensorShape(29U), TensorShape(29U, 7U, 36U, 2U), 1.0f, 0.0f);
+    }
+};
+
+class SmallAccumulateGEMMDataset final : public GEMMDataset
+{
+public:
+    SmallAccumulateGEMMDataset()
+    {
+        add_config(TensorShape(8U, 2U), TensorShape(16U, 8U), TensorShape(16U, 2U), TensorShape(16U, 2U), 1.0f, 0.0f);
+        add_config(TensorShape(31U, 1U), TensorShape(23U, 31U), TensorShape(23U, 1U), TensorShape(23U, 1U), 1.0f, 0.0f);
+        add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), TensorShape(33U, 13U), 1.0f, 0.0f);
+    }
+};
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SMALL_GEMM_DATASET */
+#endif // ACL_TESTS_DATASETS_SMALLGEMMDATASET_H
diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h
index 1b6c65307b..929940d2d9 100644
--- a/tests/datasets/SmallGEMMLowpDataset.h
+++ b/tests/datasets/SmallGEMMLowpDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,11 +58,10 @@ public:
     SmallGEMMLowpOutput3DDataset()
     {
         add_config(TensorShape(21U, 14U), TensorShape(34U, 21U), TensorShape(34U, 7U, 2U), 0, 0);
-        add_config(TensorShape(31U, 1U), TensorShape(23U, 31U), TensorShape(23U, 1U, 1U), -2, 13);
-        add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 4U, 3U), 0, 4);
-        add_config(TensorShape(32U, 1U), TensorShape(17U, 32U), TensorShape(17U, 1U, 1U), -2, 1);
-        add_config(TensorShape(16U, 16U), TensorShape(8U, 16U), TensorShape(8U, 8U, 2U), 5, 9);
-        add_config(TensorShape(16U, 16U, 5U), TensorShape(8U, 16U, 5U), TensorShape(8U, 8U, 2U, 5U), -7, 2);
+        add_config(TensorShape(31U, 1U), TensorShape(3U, 31U), TensorShape(3U, 1U, 1U), -2, 13);
+        add_config(TensorShape(38U, 12U), TensorShape(1U, 38U), TensorShape(1U, 4U, 3U), 0, 4);
+        add_config(TensorShape(16U, 16U), TensorShape(11U, 16U), TensorShape(11U, 8U, 2U), 2, -1);
+        add_config(TensorShape(16U, 16U, 5U), TensorShape(13U, 16U, 5U), TensorShape(13U, 8U, 2U, 5U), -3, 2);
     }
 };
 class SmallGEMMLowpInputOutput3DDataset final : public GEMMLowpDataset
@@ -71,13 +70,28 @@ public:
     SmallGEMMLowpInputOutput3DDataset()
     {
         add_config(TensorShape(21U, 14U, 13U), TensorShape(34U, 21U), TensorShape(34U, 14U, 13U), 0, 0);
-        add_config(TensorShape(31U, 1U, 3U), TensorShape(23U, 31U), TensorShape(23U, 1U, 3U), 0, 0);
+        add_config(TensorShape(31U, 1U, 3U), TensorShape(1U, 31U), TensorShape(1U, 1U, 3U), 0, 0);
         add_config(TensorShape(38U, 12U, 2U), TensorShape(21U, 38U), TensorShape(21U, 12U, 2U), -2, 13);
-        add_config(TensorShape(32U, 1U, 4U, 3U), TensorShape(17U, 32U), TensorShape(17U, 1U, 4U, 3U), 0, 4);
-        add_config(TensorShape(16U, 16U, 3U, 2U), TensorShape(8U, 16U), TensorShape(8U, 16U, 3U, 2U), -2, 0);
+        add_config(TensorShape(16U, 16U, 3U, 2U), TensorShape(15U, 16U), TensorShape(15U, 16U, 3U, 2U), -2, 0);
         add_config(TensorShape(16U, 16U, 5U, 3U), TensorShape(8U, 16U), TensorShape(8U, 16U, 5U, 3U), -9, 1);
     }
 };
+
+class SmallGEMMLowpBatchedMatMulDataset final : public GEMMLowpDataset
+{
+public:
+    SmallGEMMLowpBatchedMatMulDataset()
+    {
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0);
+        add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), 0, 0);
+        add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13);
+        add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -2, 0);
+        add_config(TensorShape(15U, 7U, 36U), TensorShape(29U, 15U, 36U), TensorShape(29U, 7U, 36U), -9, 1);
+        add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), -3, 2);
+        add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 13);
+    }
+};
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/datasets/SmallMatMulDataset.h b/tests/datasets/SmallMatMulDataset.h
new file mode 100644
index 0000000000..bb4cdad54b
--- /dev/null
+++ b/tests/datasets/SmallMatMulDataset.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_SMALLMATMULDATASET
+#define ACL_TESTS_DATASETS_SMALLMATMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/MatMulDataset.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class SmallMatMulDataset final : public MatMulDataset
+{
+public:
+    SmallMatMulDataset()
+    {
+        add_config(TensorShape(3U, 4U, 2U, 2U), TensorShape(2U, 3U, 2U, 2U), TensorShape(2U, 4U, 2U, 2U));
+        add_config(TensorShape(9U, 6U), TensorShape(5U, 9U), TensorShape(5U, 6U));
+        add_config(TensorShape(31U, 1U), TensorShape(23U, 31U), TensorShape(23U, 1U));
+        add_config(TensorShape(8U, 4U, 2U), TensorShape(16U, 8U, 2U), TensorShape(16U, 4U, 2U));
+        add_config(TensorShape(32U, 2U), TensorShape(17U, 32U), TensorShape(17U, 2U));
+    }
+};
+
+class SmallerMatMulDataset final : public MatMulDataset
+{
+public:
+    SmallerMatMulDataset()
+    {
+        add_config(TensorShape(9U, 6U), TensorShape(5U, 9U), TensorShape(5U, 6U));
+        add_config(TensorShape(8U, 4U, 2U), TensorShape(16U, 8U, 2U), TensorShape(16U, 4U, 2U));
+        add_config(TensorShape(32U, 2U), TensorShape(17U, 32U), TensorShape(17U, 2U));
+    }
+};
+
+class TinyMatMulDataset final : public MatMulDataset
+{
+public:
+    TinyMatMulDataset()
+    {
+        add_config(TensorShape(1U), TensorShape(1U), TensorShape(1U));
+        add_config(TensorShape(2U, 2U), TensorShape(2U, 2U), TensorShape(2U, 2U));
+    }
+};
+
+class SmallMatMulDatasetRhsExportToCLImageRhsT final : public MatMulDataset
+{
+public:
+    // Some considerations:
+    //  (1) K dimension should be a multiple of 4
+    //  See (2), (3), and (4) in SmallMatMulDatasetRhsExportToCLImageRhsNT
+    SmallMatMulDatasetRhsExportToCLImageRhsT()
+    {
+        add_config(TensorShape(8U /*K*/, 3U /*M*/, 2U, 1U, 2U), TensorShape(20U /*N*/, 8U /*K*/, 2U, 1U, 2U), TensorShape(20U /*N*/, 3U /*M*/, 2U, 1U, 2U));
+    }
+};
+
+class SmallMatMulDatasetRhsExportToCLImageRhsNT final : public MatMulDataset
+{
+public:
+    // Some considerations:
+    //  (1) N (Dimension 0 of Rhs matrix) dimension should be a multiple of 4
+    //  (2) Having N=20 enables us to test all possible N0 values, i.e. 4, 8, 16
+    //  (3) It's important to have more than one loop iteration in the K dimension
+    //      K has been chosen in accordance with K0
+    //  (4) The 5-th dimension has been chosen as non-unit because export_to_cl_image checks
+    //      were using dim1 * dim2 * dim3 to calculate the CLImage height; however, in our case
+    //      the tensor can be > 4D. To stress that case, the fifth dimension is chosen to be non-unit as well
+    SmallMatMulDatasetRhsExportToCLImageRhsNT()
+    {
+        add_config(TensorShape(7U, 3U, 2U, 1U, 2U), TensorShape(20U, 7U, 2U, 1U, 2U), TensorShape(20U, 3U, 2U, 1U, 2U));
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ACL_TESTS_DATASETS_SMALLMATMULDATASET */
diff --git a/tests/ILutAccessor.h b/tests/datasets/SmallMatMulMMULDataset.h
index 39fa202d2c..9e517488af 100644
--- a/tests/ILutAccessor.h
+++ b/tests/datasets/SmallMatMulMMULDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,52 +21,46 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_ILUTACCESSOR_H
-#define ARM_COMPUTE_TEST_ILUTACCESSOR_H
 
-#include "arm_compute/core/Coordinates.h"
+#ifndef ACL_TESTS_DATASETS_SMALLMATMULMMULDATASET
+#define ACL_TESTS_DATASETS_SMALLMATMULMMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "tests/datasets/MatMulDataset.h"
 
 namespace arm_compute
 {
 namespace test
 {
-/** Common interface to provide information and access to Lut like
- * structures.
+namespace datasets
+{
+/** MatMul MMUL shapes are similar to MatMul shapes except that K has to be a multiple of MMUL_K0 which is 4 (e.g. see src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp for the definition)
  */
-template <typename T>
-class ILutAccessor
+class SmallMatMulMMULDataset final : public MatMulDataset
 {
 public:
-    /** Lut value type */
-    using value_type = T;
-
-    /** Pure virtual destructor. */
-    virtual ~ILutAccessor() = default;
-
-    /** Number of elements of the Lut.
-     *
-     * @return the number of elements.
-     */
-    virtual int num_elements() const = 0;
-
-    /** Read access to the specified element.
-     *
-     * @param[in] input_value Lut input value, from numericlimits<T>:min to numericlimits<T>:max.
-     *
-     * @return Output desired element.
-     */
-    virtual const T &operator[](T input_value) const = 0;
+    SmallMatMulMMULDataset()
+    {
+        add_config(TensorShape(8U, 4U, 2U, 2U), TensorShape(2U, 8U, 2U, 2U), TensorShape(2U, 4U, 2U, 2U));
+        add_config(TensorShape(28U, 1U), TensorShape(23U, 28U), TensorShape(23U, 1U));
+        add_config(TensorShape(8U, 4U, 2U), TensorShape(16U, 8U, 2U), TensorShape(16U, 4U, 2U));
+        add_config(TensorShape(32U, 2U), TensorShape(17U, 32U), TensorShape(17U, 2U));
+        add_config(TensorShape(8U, 6U), TensorShape(7U, 8U), TensorShape(7U, 6U));
+    }
+};
 
-    /** Write access to the specified element.
-     *
-     * @param[in] input_value Lut input value, from numericlimits<T>:min to numericlimits<T>:max.
-     *
-     * @return Output desired element.
-     */
-    virtual T &operator[](T input_value) = 0;
+class TinyMatMulMMULDataset final : public MatMulDataset
+{
+public:
+    TinyMatMulMMULDataset()
+    {
+        add_config(TensorShape(4U, 4U), TensorShape(4U, 4U), TensorShape(4U, 4U));
+    }
 };
 
+} // namespace datasets
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ILUTACCESSOR_H */
+
+#endif /* ACL_TESTS_DATASETS_SMALLMATMULMMULDATASET */
diff --git a/tests/datasets/dynamic_fusion/PoolingLayerDataset.h b/tests/datasets/dynamic_fusion/PoolingLayerDataset.h
new file mode 100644
index 0000000000..c4911f4940
--- /dev/null
+++ b/tests/datasets/dynamic_fusion/PoolingLayerDataset.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "utils/TypePrinter.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+
+using Pool2dAttributes = arm_compute::experimental::dynamic_fusion::Pool2dAttributes;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+
+class DynamicFusionPoolingLayerDataset
+{
+public:
+    using type = std::tuple<TensorShape, Pool2dAttributes>;
+
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator      src_it,
+                 std::vector<Pool2dAttributes>::const_iterator infos_it)
+            : _src_it{ std::move(src_it) },
+              _infos_it{ std::move(infos_it) }
+        {
+        }
+
+        std::string description() const
+        {
+            std::stringstream description;
+            description << "In=" << *_src_it << ":";
+            description << "Info=" << *_infos_it << ":";
+            return description.str();
+        }
+
+        DynamicFusionPoolingLayerDataset::type operator*() const
+        {
+            return std::make_tuple(*_src_it, *_infos_it);
+        }
+
+        iterator &operator++()
+        {
+            ++_src_it;
+            ++_infos_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator      _src_it;
+        std::vector<Pool2dAttributes>::const_iterator _infos_it;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_src_shapes.begin(), _infos.begin());
+    }
+
+    int size() const
+    {
+        return std::min(_src_shapes.size(), _infos.size());
+    }
+
+    void add_config(TensorShape src, Pool2dAttributes info)
+    {
+        _src_shapes.emplace_back(std::move(src));
+        _infos.emplace_back(std::move(info));
+    }
+
+protected:
+    DynamicFusionPoolingLayerDataset()                       = default;
+    DynamicFusionPoolingLayerDataset(DynamicFusionPoolingLayerDataset &&) = default;
+
+private:
+    std::vector<TensorShape>      _src_shapes{};
+    std::vector<Pool2dAttributes> _infos{};
+};
+
+// Special pooling dataset
+class PoolingLayerDatasetSpecialDynamicFusion final : public DynamicFusionPoolingLayerDataset
+{
+public:
+    PoolingLayerDatasetSpecialDynamicFusion()
+    {
+        // NCHW DataLayout
+        // Special cases
+        add_config(TensorShape(2U, 3U, 4U, 1U), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).stride(Size2D(3,3)));
+        add_config(TensorShape(60U, 52U, 3U, 2U), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(100,100)).stride(Size2D(5,5)).pad(Padding2D(50,50,50,50)));
+        // Asymmetric padding
+        add_config(TensorShape(112U, 112U, 32U), Pool2dAttributes().pool_type(PoolingType::MAX).pool_size(Size2D(3,3)).pad(Padding2D(0,1,0,1)).stride(Size2D(2,2)));
+        add_config(TensorShape(14U, 14U, 832U), Pool2dAttributes().pool_type(PoolingType::MAX).pool_size(Size2D(2,2)).stride(Size2D(1,1)).pad(Padding2D(0,0,0,0)));
+
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h
deleted file mode 100644
index ec6f470dd7..0000000000
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_YOLOV2_ACTIVATION_LAYER_DATASET
-#define ARM_COMPUTE_TEST_YOLOV2_ACTIVATION_LAYER_DATASET
-
-#include "tests/framework/datasets/Datasets.h"
-
-#include "utils/TypePrinter.h"
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace datasets
-{
-class YOLOV2ActivationLayerRELUDataset final : public
-    framework::dataset::CartesianProductDataset<framework::dataset::InitializerListDataset<TensorShape>, framework::dataset::SingletonDataset<ActivationLayerInfo>>
-{
-public:
-    YOLOV2ActivationLayerRELUDataset()
-        : CartesianProductDataset
-    {
-        framework::dataset::make("Shape", { // relu1
-            TensorShape(416U, 416U, 32U),
-            // relu2
-            TensorShape(208U, 208U, 64U),
-            // relu3, relu5
-            TensorShape(104U, 104U, 128U),
-            // relu4
-            TensorShape(104U, 104U, 64U),
-            // relu6, relu8
-            TensorShape(52U, 52U, 256U),
-            // relu7
-            TensorShape(52U, 52U, 128U),
-            // relu9, relu11, relu13
-            TensorShape(26U, 26U, 512U),
-            // relu10, relu12
-            TensorShape(26U, 26U, 256U),
-            // relu14, relu16, relu18, relu19, relu20, relu21
-            TensorShape(13U, 13U, 1024U),
-            // relu15, relu17
-            TensorShape(13U, 13U, 512U) }),
-        framework::dataset::make("Info", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
-    }
-    {
-    }
-    YOLOV2ActivationLayerRELUDataset(YOLOV2ActivationLayerRELUDataset &&) = default;
-    ~YOLOV2ActivationLayerRELUDataset()                                   = default;
-};
-
-class YOLOV2ActivationLayerLINEARDataset final : public
-    framework::dataset::CartesianProductDataset<framework::dataset::InitializerListDataset<TensorShape>, framework::dataset::SingletonDataset<ActivationLayerInfo>>
-{
-public:
-    YOLOV2ActivationLayerLINEARDataset()
-        : CartesianProductDataset
-    {
-        framework::dataset::make("Shape", { // linear22
-            TensorShape(15U, 15U, 425U) }),
-        framework::dataset::make("Info", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR))
-    }
-    {
-    }
-    YOLOV2ActivationLayerLINEARDataset(YOLOV2ActivationLayerLINEARDataset &&) = default;
-    ~YOLOV2ActivationLayerLINEARDataset()                                     = default;
-};
-
-class YOLOV2ActivationLayerDataset final : public framework::dataset::JoinDataset<YOLOV2ActivationLayerRELUDataset, YOLOV2ActivationLayerLINEARDataset>
-{
-public:
-    YOLOV2ActivationLayerDataset()
-        : JoinDataset
-    {
-        YOLOV2ActivationLayerRELUDataset(),
-        YOLOV2ActivationLayerLINEARDataset()
-    }
-    {
-    }
-    YOLOV2ActivationLayerDataset(YOLOV2ActivationLayerDataset &&) = default;
-    ~YOLOV2ActivationLayerDataset()                               = default;
-};
-} // namespace datasets
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_YOLOV2_ACTIVATION_LAYER_DATASET */
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h
deleted file mode 100644
index 9997ffc0c0..0000000000
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_YOLOV2_BATCHNORMALIZATION_LAYER_DATASET
-#define ARM_COMPUTE_TEST_YOLOV2_BATCHNORMALIZATION_LAYER_DATASET
-
-#include "tests/datasets/BatchNormalizationLayerDataset.h"
-
-#include "utils/TypePrinter.h"
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace datasets
-{
-class YOLOV2BatchNormalizationLayerDataset final : public BatchNormalizationLayerDataset
-{
-public:
-    YOLOV2BatchNormalizationLayerDataset()
-    {
-        // conv1_bn
-        add_config(TensorShape(416U, 416U, 32U), TensorShape(32U), 0.00001f);
-        // conv2_bn
-        add_config(TensorShape(208U, 208U, 64U), TensorShape(64U), 0.00001f);
-        // conv3_bn, conv5_bn
-        add_config(TensorShape(104U, 104U, 128U), TensorShape(128U), 0.00001f);
-        // conv4_bn
-        add_config(TensorShape(104U, 104U, 64U), TensorShape(64U), 0.00001f);
-        // conv6_bn, conv8_bn
-        add_config(TensorShape(52U, 52U, 256U), TensorShape(256U), 0.00001f);
-        // conv7_bn
-        add_config(TensorShape(52U, 52U, 128U), TensorShape(128U), 0.00001f);
-        // conv9_bn, conv11_bn, conv13_bn
-        add_config(TensorShape(26U, 26U, 512U), TensorShape(512U), 0.00001f);
-        // conv10_bn, conv12_bn
-        add_config(TensorShape(26U, 26U, 256U), TensorShape(256U), 0.00001f);
-        // conv14_bn, conv16_bn, conv18_bn, conv19_bn, conv20_bn, conv21_bn
-        add_config(TensorShape(13U, 13U, 1024U), TensorShape(1024U), 0.00001f);
-        // conv15_bn, conv17_bn
-        add_config(TensorShape(13U, 13U, 512U), TensorShape(512U), 0.00001f);
-    }
-};
-} // namespace datasets
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_YOLOV2_BATCHNORMALIZATION_LAYER_DATASET */
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h
deleted file mode 100644
index 67d57bf680..0000000000
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_YOLOV2_CONVOLUTION_LAYER_DATASET
-#define ARM_COMPUTE_TEST_YOLOV2_CONVOLUTION_LAYER_DATASET
-
-#include "tests/datasets/ConvolutionLayerDataset.h"
-
-#include "utils/TypePrinter.h"
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace datasets
-{
-class YOLOV2ConvolutionLayerDataset final : public ConvolutionLayerDataset
-{
-public:
-    YOLOV2ConvolutionLayerDataset()
-    {
-        // conv1
-        add_config(TensorShape(416U, 416U, 3U), TensorShape(3U, 3U, 3U, 32U), TensorShape(32U), TensorShape(416U, 416U, 32U), PadStrideInfo(1, 1, 1, 1));
-        // conv2
-        add_config(TensorShape(208U, 208U, 32U), TensorShape(3U, 3U, 32U, 64U), TensorShape(64U), TensorShape(208U, 208U, 64U), PadStrideInfo(1, 1, 1, 1));
-        // conv3, conv5
-        add_config(TensorShape(104U, 104U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(104U, 104U, 128U), PadStrideInfo(1, 1, 1, 1));
-        // conv4
-        add_config(TensorShape(104U, 104U, 128U), TensorShape(1U, 1U, 128U, 64U), TensorShape(64U), TensorShape(104U, 104U, 64U), PadStrideInfo(1, 1, 0, 0));
-        // conv6, conv8
-        add_config(TensorShape(52U, 52U, 128U), TensorShape(3U, 3U, 128U, 256U), TensorShape(256U), TensorShape(52U, 52U, 256U), PadStrideInfo(1, 1, 1, 1));
-        // conv7
-        add_config(TensorShape(52U, 52U, 256U), TensorShape(1U, 1U, 256U, 128U), TensorShape(128U), TensorShape(52U, 52U, 128U), PadStrideInfo(1, 1, 0, 0));
-        // conv9, conv11, conv13
-        add_config(TensorShape(26U, 26U, 256U), TensorShape(3U, 3U, 256U, 512U), TensorShape(512U), TensorShape(26U, 26U, 512U), PadStrideInfo(1, 1, 1, 1));
-        // conv10, conv12
-        add_config(TensorShape(26U, 26U, 512U), TensorShape(1U, 1U, 512U, 256U), TensorShape(256U), TensorShape(26U, 26U, 256U), PadStrideInfo(1, 1, 0, 0));
-        // conv14, conv16, conv18
-        add_config(TensorShape(13U, 13U, 512U), TensorShape(3U, 3U, 512U, 1024U), TensorShape(1024U), TensorShape(13U, 13U, 1024U), PadStrideInfo(1, 1, 1, 1));
-        // conv15, conv17
-        add_config(TensorShape(13U, 13U, 1024U), TensorShape(1U, 1U, 1024U, 512U), TensorShape(512U), TensorShape(13U, 13U, 512U), PadStrideInfo(1, 1, 0, 0));
-        // conv19, conv20
-        add_config(TensorShape(13U, 13U, 1024U), TensorShape(3U, 3U, 1024U, 1024U), TensorShape(1024U), TensorShape(13U, 13U, 1024U), PadStrideInfo(1, 1, 1, 1));
-        // conv21
-        add_config(TensorShape(13U, 13U, 3072U), TensorShape(3U, 3U, 3072U, 1024U), TensorShape(1024U), TensorShape(13U, 13U, 1024U), PadStrideInfo(1, 1, 1, 1));
-        // conv22
-        add_config(TensorShape(13U, 13U, 1024U), TensorShape(1U, 1U, 1024U, 425U), TensorShape(425U), TensorShape(13U, 13U, 425U), PadStrideInfo(1, 1, 0, 0));
-    }
-};
-} // namespace datasets
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_YOLOV2_CONVOLUTION_LAYER_DATASET */
diff --git a/tests/framework/Asserts.h b/tests/framework/Asserts.h
index b8a8fe091c..7adfa8f2f3 100644
--- a/tests/framework/Asserts.h
+++ b/tests/framework/Asserts.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include <sstream>
 #include <type_traits>
 
+#include "utils/TypePrinter.h"
+
 namespace arm_compute
 {
 namespace test
@@ -42,6 +44,11 @@ inline int make_printable(int8_t value)
     return value;
 }
 
+inline std::string make_printable(const arm_compute::WeightFormat wf)
+{
+    return arm_compute::to_string(wf);
+}
+
 inline unsigned int make_printable(uint8_t value)
 {
     return value;
@@ -153,6 +160,7 @@ ARM_COMPUTE_TEST_COMP_FACTORY(ASSERT, Assertion, !=, NOT_EQUAL, throw arm_comput
         arm_compute::test::framework::Framework::get().clear_test_info();                                                                     \
     } while(false)
 
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
 #define ARM_COMPUTE_EXPECT_THROW(X, LEVEL)                                                                                                    \
     do                                                                                                                                        \
     {                                                                                                                                         \
@@ -175,6 +183,17 @@ ARM_COMPUTE_TEST_COMP_FACTORY(ASSERT, Assertion, !=, NOT_EQUAL, throw arm_comput
         }                                                                                                                                     \
         arm_compute::test::framework::Framework::get().clear_test_info();                                                                     \
     } while(false)
+#else // defined(ARM_COMPUTE_ASSERTS_ENABLED)
+#define ARM_COMPUTE_EXPECT_THROW(X, LEVEL)                                   \
+    do                                                                       \
+    {                                                                        \
+        std::stringstream msg;                                               \
+        msg << "'" #X "' Skipped: asserts disabled, cannot throw\n";         \
+        arm_compute::test::framework::Framework::get().print_test_info(msg); \
+        arm_compute::test::framework::Framework::get().log_info(msg.str());  \
+        arm_compute::test::framework::Framework::get().clear_test_info();    \
+    } while(false)
+#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
 
 #define ARM_COMPUTE_ASSERT_FAIL(MSG)                                                                              \
     do                                                                                                            \
diff --git a/tests/framework/BUILD.bazel b/tests/framework/BUILD.bazel
new file mode 100644
index 0000000000..17d5a15a11
--- /dev/null
+++ b/tests/framework/BUILD.bazel
@@ -0,0 +1,63 @@
+# Copyright (c) 2023 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+cc_library(
+    name = "framework",
+    srcs = glob(
+        [
+            "*.cpp",
+            "command_line/*.cpp",
+            "printers/*.cpp",
+            "datasets/*.cpp",
+            "instruments/*.cpp",
+            "instruments/*.hpp",
+        ],
+        exclude = [
+            "**/*PMU*",
+            "**/*OpenCL*",
+            "**/*MaliCounter*",
+        ],
+    ),
+    hdrs = glob([
+        "*.h",
+        "command_line/*.h",
+        "printers/*.h",
+        "datasets/*.h",
+        "instruments/*.h",
+    ]),
+    copts = [
+        "-Wno-overloaded-virtual",
+    ] + select({
+                  "//:arch_armv8-a": ["-march=armv8-a"],
+                  "//:arch_armv8.2-a+fp16": ["-march=armv8.2-a+fp16"],
+                  "//conditions:default": ["-march=armv8-a"],
+              }),
+    linkstatic = True,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//arm_compute:core_headers",
+        "//arm_compute:graph_headers",
+        "//arm_compute:runtime_headers",
+        "//support",
+        "//utils",
+    ],
+)
diff --git a/tests/framework/Framework.cpp b/tests/framework/Framework.cpp
index 436aac0a34..bfb955c525 100644
--- a/tests/framework/Framework.cpp
+++ b/tests/framework/Framework.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -130,10 +130,12 @@ Framework &Framework::get()
 void Framework::init(const FrameworkConfig &config)
 {
     _test_filter.reset(new TestFilter(config.mode, config.name_filter, config.id_filter));
-    _num_iterations = config.num_iterations;
-    _log_level      = config.log_level;
-    _cooldown_sec   = config.cooldown_sec;
-    _configure_only = config.configure_only;
+    _num_iterations  = config.num_iterations;
+    _log_level       = config.log_level;
+    _cooldown_sec    = config.cooldown_sec;
+    _configure_only  = config.configure_only;
+    _print_rerun_cmd = config.print_rerun_cmd;
+    _seed            = config.seed;
 
     _instruments = std::set<framework::InstrumentsDescription>(std::begin(config.instruments), std::end(config.instruments));
 }
@@ -209,6 +211,7 @@ void Framework::log_test_end(const TestInfo &info)
     {
         func_on_all_printers([&](Printer * p)
         {
+            p->print_profiler_header(_test_results.at(info).header_data);
             p->print_measurements(_test_results.at(info).measurements);
         });
     }
@@ -291,13 +294,13 @@ bool Framework::error_on_missing_assets() const
     return _error_on_missing_assets;
 }
 
-void Framework::run_test(const TestInfo &info, TestCaseFactory &test_factory)
+TestResult::Status Framework::run_test(const TestInfo &info, TestCaseFactory &test_factory)
 {
     if(test_factory.status() == TestCaseFactory::Status::DISABLED)
     {
         log_test_skipped(info);
         set_test_result(info, TestResult(TestResult::Status::DISABLED));
-        return;
+        return TestResult::Status::DISABLED;
     }
 
     log_test_start(info);
@@ -528,14 +531,16 @@ void Framework::run_test(const TestInfo &info, TestCaseFactory &test_factory)
     {
         if(_stop_on_error)
         {
-            throw std::runtime_error("Abort on first error.");
+            throw std::runtime_error("Abandon on first error.");
         }
     }
 
+    result.header_data  = profiler.header();
     result.measurements = profiler.measurements();
 
     set_test_result(info, result);
     log_test_end(info);
+    return result.status;
 }
 
 bool Framework::run()
@@ -555,6 +560,7 @@ bool Framework::run()
 
     int id          = 0;
     int id_run_test = 0;
+    ARM_COMPUTE_UNUSED(id_run_test); // Not used if ARM_COMPUTE_CL is not defined
 
     for(auto &test_factory : _test_factories)
     {
@@ -578,9 +584,11 @@ bool Framework::run()
                 CLScheduler::get().set_queue(new_queue);
             }
 #endif // ARM_COMPUTE_CL
-
-            run_test(test_info, *test_factory);
-
+            TestResult::Status result = run_test(test_info, *test_factory);
+            if((_print_rerun_cmd) && (result == TestResult::Status::CRASHED || result == TestResult::Status::FAILED))
+            {
+                std::cout << "Rerun command: ./arm_compute_validation --filter='^" << test_info.name << "$' --seed=" << _seed << std::endl;
+            }
             ++id_run_test;
 
             // Run test delay
@@ -630,6 +638,7 @@ void Framework::print_test_results(Printer &printer) const
     for(const auto &test : _test_results)
     {
         printer.print_test_header(test.first);
+        printer.print_profiler_header(test.second.header_data);
         printer.print_measurements(test.second.measurements);
         printer.print_test_footer();
     }
@@ -679,7 +688,7 @@ std::vector<TestInfo> Framework::test_infos() const
 
     for(const auto &factory : _test_factories)
     {
-        TestInfo test_info{ id, factory->name(), factory->mode(), factory->status() };
+        const TestInfo test_info{ id, factory->name(), factory->mode(), factory->status() };
 
         if(_test_filter->is_selected(test_info))
         {
diff --git a/tests/framework/Framework.h b/tests/framework/Framework.h
index 4c2e86c6ea..2dded30038 100644
--- a/tests/framework/Framework.h
+++ b/tests/framework/Framework.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,6 +64,8 @@ struct FrameworkConfig
     float                                          cooldown_sec{ -1.f };        /**< Delay between tests in seconds. */
     LogLevel                                       log_level{ LogLevel::NONE }; /**< Verbosity of the output. */
     bool                                           configure_only{ false };     /**< Only configure kernels */
+    bool                                           print_rerun_cmd{ false };    /**< Print the command to rerun the failed testcase */
+    unsigned int                                   seed{ 0 };                   /**< The seed that is used to fill tensors with random values.*/
 };
 
 /** Information about a test case.
@@ -121,8 +123,6 @@ public:
      * registering test cases.
      *
      * @param[in] name Name of the added test suite.
-     *
-     * @return Name of the current test suite.
      */
     void push_suite(std::string name);
 
@@ -230,13 +230,13 @@ public:
 
     /** Indicates if test execution is stopped after the first failed test.
      *
-     * @return True if the execution is going to be aborted after the first failed test.
+     * @return True if the execution is going to be stopped after the first failed test.
      */
     bool stop_on_error() const;
 
-    /** Set whether to abort execution after the first failed test.
+    /** Set whether to stop execution after the first failed test.
      *
-     * @param[in] stop_on_error True if execution is going to be aborted after first failed test.
+     * @param[in] stop_on_error True if execution is going to be stopped after first failed test.
      */
     void set_stop_on_error(bool stop_on_error);
 
@@ -330,7 +330,7 @@ private:
     Framework(const Framework &) = delete;
     Framework &operator=(const Framework &) = delete;
 
-    void run_test(const TestInfo &info, TestCaseFactory &test_factory);
+    TestResult::Status run_test(const TestInfo &info, TestCaseFactory &test_factory);
     std::map<TestResult::Status, int> count_test_results() const;
 
     /** Returns the current test suite name.
@@ -358,6 +358,8 @@ private:
     std::vector<Printer *> _printers{};
     bool                   _configure_only{ false };
     bool                   _new_fixture_call{ false };
+    bool                   _print_rerun_cmd{ false };
+    unsigned int           _seed{ 0 };
 
     using create_function = std::unique_ptr<Instrument>();
     std::map<InstrumentsDescription, create_function *> _available_instruments{};
diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h
index 23c826657d..5ce0842864 100644
--- a/tests/framework/Macros.h
+++ b/tests/framework/Macros.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,8 +49,8 @@
 
 #define CONCAT(ARG0, ARG1) ARG0##ARG1
 
-#define VARIADIC_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, size, ...) size
-#define VARIADIC_SIZE(...) VARIADIC_SIZE_IMPL(__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define VARIADIC_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, size, ...) size
+#define VARIADIC_SIZE(...) VARIADIC_SIZE_IMPL(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
 
 #define JOIN_PARAM1(OP, param) OP(0, param)
 #define JOIN_PARAM2(OP, param, ...) \
@@ -92,6 +92,12 @@
 #define JOIN_PARAM14(OP, param, ...) \
     OP(13, param)                    \
     , JOIN_PARAM13(OP, __VA_ARGS__)
+#define JOIN_PARAM15(OP, param, ...) \
+    OP(14, param)                    \
+    , JOIN_PARAM14(OP, __VA_ARGS__)
+#define JOIN_PARAM16(OP, param, ...) \
+    OP(15, param)                    \
+    , JOIN_PARAM15(OP, __VA_ARGS__)
 #define JOIN_PARAM(OP, NUM, ...) \
     CONCAT(JOIN_PARAM, NUM)      \
     (OP, __VA_ARGS__)
@@ -121,13 +127,13 @@
     void do_setup() override                                     \
     {                                                            \
         framework::Framework::get().set_new_fixture_call(false); \
-        apply(this, &FIXTURE::setup<As...>, _data);              \
+        apply(this, &FIXTURE::setup, _data);              \
     }
 #define FIXTURE_DATA_SETUP_NEW(FIXTURE)                         \
     void do_setup() override                                    \
     {                                                           \
         framework::Framework::get().set_new_fixture_call(true); \
-        apply(this, &FIXTURE::setup<As...>, _data);             \
+        apply(this, &FIXTURE::setup, _data);             \
         configure_target();                                     \
         if(!framework::Framework::get().configure_only())       \
         {                                                       \
@@ -224,6 +230,11 @@
 #define DISABLED_FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \
     FIXTURE_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::DISABLED)
 
+#define EMPTY_BODY_FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \
+    FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE)                \
+    {                                                          \
+    }
+
 #define FIXTURE_DATA_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, STATUS, DATASET)                                                      \
     template <typename T>                                                                                                           \
     class TEST_NAME;                                                                                                                \
@@ -313,4 +324,4 @@
 //
 // TEST CASE MACROS END
 //
-#endif /* ARM_COMPUTE_TEST_FRAMEWORK_MACROS */
-\ No newline at end of file
+#endif /* ARM_COMPUTE_TEST_FRAMEWORK_MACROS */
diff --git a/tests/framework/Profiler.cpp b/tests/framework/Profiler.cpp
index b527eb4f09..a4a9beaa29 100644
--- a/tests/framework/Profiler.cpp
+++ b/tests/framework/Profiler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,6 +81,8 @@ void Profiler::test_stop()
         {
             _measurements[instrument->id() + "/" + measurement.first].push_back(measurement.second);
         }
+
+        _header_data = instrument->instrument_header();
     }
 }
 
@@ -88,6 +90,11 @@ const Profiler::MeasurementsMap &Profiler::measurements() const
 {
     return _measurements;
 }
+
+const std::string &Profiler::header() const
+{
+    return _header_data;
+}
 } // namespace framework
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/framework/Profiler.h b/tests/framework/Profiler.h
index 588276fcd5..7df085c8f1 100644
--- a/tests/framework/Profiler.h
+++ b/tests/framework/Profiler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,9 +87,16 @@ public:
      */
     const MeasurementsMap &measurements() const;
 
+    /** Return JSON formatted header data.
+     *
+     * @return JSON formatted string
+     */
+    const std::string &header() const;
+
 private:
     std::vector<std::unique_ptr<Instrument>> _instruments{};
     MeasurementsMap                          _measurements{};
+    std::string                              _header_data{};
 };
 } // namespace framework
 } // namespace test
diff --git a/tests/framework/SConscript b/tests/framework/SConscript
index e805ac0e2c..450ffd77b0 100644
--- a/tests/framework/SConscript
+++ b/tests/framework/SConscript
@@ -1,4 +1,7 @@
-# Copyright (c) 2017 Arm Limited.
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2017-2022 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -27,8 +30,8 @@ Import('vars')
 
 # vars is imported from arm_compute:
 variables = [
-    BoolVariable("pmu", "Enable PMU counters", False),
-    BoolVariable("mali", "Enable Mali hardware counters", False),
+    BoolVariable("pmu", "Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it)", False),
+    BoolVariable("mali", "Enable the collection of Arm® Mali™ hardware counters to measure execution time in benchmark tests. (Your device needs to have a Arm® Mali™ driver that supports it)", False),
 ]
 
 # We need a separate set of Variables for the Help message (Otherwise the global variables will get displayed twice)
@@ -67,7 +70,7 @@ if not env['opencl']:
     files = [f for f in files if "OpenCL" not in os.path.basename(str(f))]
 
 if not framework_env['mali']:
-    # Remove Mali files
+    # Remove Arm® Mali™ files
     files = [f for f in files if "MaliCounter" not in os.path.basename(str(f))]
 else:
     framework_env.Append(CPPDEFINES = ['MALI_ENABLED'])
diff --git a/tests/framework/TestResult.h b/tests/framework/TestResult.h
index 10f10c1d86..18e54343dc 100644
--- a/tests/framework/TestResult.h
+++ b/tests/framework/TestResult.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,6 +73,7 @@ struct TestResult
 
     Status                    status{ Status::NOT_RUN }; /**< Execution status */
     Profiler::MeasurementsMap measurements{};            /**< Profiling information */
+    std::string               header_data{};             /**< Test header data */
 };
 } // namespace framework
 } // namespace test
diff --git a/tests/framework/command_line/CommonOptions.cpp b/tests/framework/command_line/CommonOptions.cpp
index 6fb37470c1..e6f1929bb1 100644
--- a/tests/framework/command_line/CommonOptions.cpp
+++ b/tests/framework/command_line/CommonOptions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,9 @@
 
 #include "../Framework.h"
 #include "../printers/Printers.h"
+#if !defined(_WIN64)
 #include <unistd.h>
+#endif // !defined(_WIN64)
 
 using namespace arm_compute::utils;
 
@@ -43,7 +45,11 @@ CommonOptions::CommonOptions(CommandLineParser &parser)
       log_file(parser.add_option<SimpleOption<std::string>>("log-file")),
       log_level(),
       throw_errors(parser.add_option<ToggleOption>("throw-errors")),
-      color_output(parser.add_option<ToggleOption>("color-output", isatty(STDOUT_FILENO))), // Only enable colors by default if we're running in a terminal
+#if !defined(_WIN64)
+     color_output(parser.add_option<ToggleOption>("color-output", isatty(STDOUT_FILENO))), // Only enable colors by default if we're running in a terminal
+#else // !defined(_WIN64)
+     color_output(parser.add_option<ToggleOption>("color-output", 0)),
+#endif // !defined(_WIN64)
       pretty_console(parser.add_option<ToggleOption>("pretty-console", false)),
       json_file(parser.add_option<SimpleOption<std::string>>("json-file")),
       pretty_file(parser.add_option<SimpleOption<std::string>>("pretty-file")),
diff --git a/tests/framework/datasets/CartesianProductDataset.h b/tests/framework/datasets/CartesianProductDataset.h
index 19ac4f6666..7b3ff12047 100644
--- a/tests/framework/datasets/CartesianProductDataset.h
+++ b/tests/framework/datasets/CartesianProductDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -186,6 +186,20 @@ CartesianProductDataset<T, U> combine(T &&dataset1, U &&dataset2)
  *
  * @param[in] dataset1 First dataset.
  * @param[in] dataset2 Second dataset.
+ * @param[in] datasets Subsequent datasets.
+ *
+ * @return A grid dataset.
+ */
+template <typename T1, typename T2, typename... Ts>
+auto combine(T1 &&dataset1, T2 &&dataset2, Ts &&... datasets) -> decltype(combine(std::forward<T1>(dataset1), combine(std::forward<T2>(dataset2), std::forward<Ts>(datasets)...)))
+{
+    return combine(std::forward<T1>(dataset1), combine(std::forward<T2>(dataset2), std::forward<Ts>(datasets)...));
+}
+
+/** Helper function to create a @ref CartesianProductDataset.
+ *
+ * @param[in] dataset1 First dataset.
+ * @param[in] dataset2 Second dataset.
  *
  * @return A grid dataset.
  */
diff --git a/tests/framework/datasets/ContainerDataset.h b/tests/framework/datasets/ContainerDataset.h
index 3987e8f1b8..de77cb24d6 100644
--- a/tests/framework/datasets/ContainerDataset.h
+++ b/tests/framework/datasets/ContainerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,6 @@
 
 #include "Dataset.h"
 #include "support/StringSupport.h"
-#include "tests/TypePrinter.h"
 
 #include <string>
 #include <tuple>
diff --git a/tests/framework/datasets/ZipDataset.h b/tests/framework/datasets/ZipDataset.h
index ce1bb37cab..0b963484c5 100644
--- a/tests/framework/datasets/ZipDataset.h
+++ b/tests/framework/datasets/ZipDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -150,6 +150,20 @@ ZipDataset<T, U> zip(T &&dataset1, U &&dataset2)
 {
     return ZipDataset<T, U>(std::forward<T>(dataset1), std::forward<U>(dataset2));
 }
+
+/** Helper function to create a @ref ZipDataset.
+ *
+ * @param[in] dataset1 First dataset.
+ * @param[in] dataset2 Second dataset.
+ * @param[in] datasets Subsequent datasets.
+ *
+ * @return A zip dataset.
+ */
+template <typename T1, typename T2, typename... Ts>
+auto zip(T1 &&dataset1, T2 &&dataset2, Ts &&... datasets) -> decltype(zip(std::forward<T1>(dataset1), zip(std::forward<T2>(dataset2), std::forward<Ts>(datasets)...)))
+{
+    return zip(std::forward<T1>(dataset1), zip(std::forward<T2>(dataset2), std::forward<Ts>(datasets)...));
+}
 } // namespace dataset
 } // namespace framework
 } // namespace test
diff --git a/tests/framework/instruments/Instrument.h b/tests/framework/instruments/Instrument.h
index 3ea15825ad..1770a492ac 100644
--- a/tests/framework/instruments/Instrument.h
+++ b/tests/framework/instruments/Instrument.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,6 +117,15 @@ public:
         return MeasurementsMap();
     }
 
+    /** Return JSON formatted instrument header string.
+     *
+     * @return JSON formatted string
+     */
+    virtual std::string instrument_header() const
+    {
+        return std::string{};
+    }
+
     /** Return the latest test measurements.
      *
      * @return the latest test measurements.
diff --git a/tests/framework/instruments/Instruments.h b/tests/framework/instruments/Instruments.h
index 8a6cec0e9c..d80032a032 100644
--- a/tests/framework/instruments/Instruments.h
+++ b/tests/framework/instruments/Instruments.h
@@ -24,12 +24,12 @@
 #ifndef ARM_COMPUTE_TEST_INSTRUMENTS
 #define ARM_COMPUTE_TEST_INSTRUMENTS
 
-#if !defined(BARE_METAL) && !defined(__APPLE__)
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__)
 #include "MaliCounter.h"
 #include "OpenCLMemoryUsage.h"
 #include "OpenCLTimer.h"
 #include "PMUCounter.h"
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) */
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) */
 #include "SchedulerTimer.h"
 #include "WallClockTimer.h"
 
diff --git a/tests/framework/instruments/Measurement.h b/tests/framework/instruments/Measurement.h
index af272a9945..2ec68d424b 100644
--- a/tests/framework/instruments/Measurement.h
+++ b/tests/framework/instruments/Measurement.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -209,10 +209,10 @@ struct Measurement
 
         /** Stored value */
         union
-            {
-                double        floating_point;
-                long long int integer;
-            } v;
+        {
+            double        floating_point;
+            long long int integer;
+        } v;
         bool is_floating_point; /**< Is the stored value floating point or integer ? */
     };
 
diff --git a/tests/framework/instruments/OpenCLTimer.cpp b/tests/framework/instruments/OpenCLTimer.cpp
index 45eb4c5c60..e9f945bd95 100644
--- a/tests/framework/instruments/OpenCLTimer.cpp
+++ b/tests/framework/instruments/OpenCLTimer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,13 @@ std::string    OpenCLClock<output_timestamps>::id() const
 
 template <bool output_timestamps>
 OpenCLClock<output_timestamps>::OpenCLClock(ScaleFactor scale_factor)
-    : _kernels(), _real_function(nullptr), _real_graph_function(nullptr), _prefix(), _timer_enabled(false)
+    : _kernels(),
+      _real_function(nullptr),
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+      _real_graph_function(nullptr),
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+      _prefix(),
+      _timer_enabled(false)
 {
     auto                        q     = CLScheduler::get().queue();
     cl_command_queue_properties props = q.getInfo<CL_QUEUE_PROPERTIES>();
@@ -91,19 +97,17 @@ void           OpenCLClock<output_timestamps>::test_start()
 {
     // Start intercepting enqueues:
     ARM_COMPUTE_ERROR_ON(_real_function != nullptr);
-    ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
-    _real_function       = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-    _real_graph_function = graph::TaskExecutor::get().execute_function;
-    auto interceptor     = [this](
-                               cl_command_queue command_queue,
-                               cl_kernel        kernel,
-                               cl_uint          work_dim,
-                               const size_t    *gwo,
-                               const size_t    *gws,
-                               const size_t    *lws,
-                               cl_uint          num_events_in_wait_list,
-                               const cl_event * event_wait_list,
-                               cl_event *       event)
+    _real_function   = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+    auto interceptor = [this](
+                           cl_command_queue command_queue,
+                           cl_kernel        kernel,
+                           cl_uint          work_dim,
+                           const size_t    *gwo,
+                           const size_t    *gws,
+                           const size_t    *lws,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event)
     {
         if(this->_timer_enabled)
         {
@@ -138,7 +142,11 @@ void           OpenCLClock<output_timestamps>::test_start()
             return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event);
         }
     };
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+    ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
+    _real_graph_function = graph::TaskExecutor::get().execute_function;
     // Start intercepting tasks:
     auto task_interceptor = [this](graph::ExecutionTask & task)
     {
@@ -153,9 +161,8 @@ void           OpenCLClock<output_timestamps>::test_start()
         this->_real_graph_function(task);
         this->_prefix = "";
     };
-
-    CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
     graph::TaskExecutor::get().execute_function = task_interceptor;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 }
 
 template <bool output_timestamps>
@@ -175,9 +182,11 @@ void           OpenCLClock<output_timestamps>::test_stop()
 {
     // Restore real function
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function;
+    _real_function                              = nullptr;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     graph::TaskExecutor::get().execute_function = _real_graph_function;
     _real_graph_function                        = nullptr;
-    _real_function                              = nullptr;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 }
 
 template <bool              output_timestamps>
diff --git a/tests/framework/instruments/OpenCLTimer.h b/tests/framework/instruments/OpenCLTimer.h
index 9904035c20..1812272435 100644
--- a/tests/framework/instruments/OpenCLTimer.h
+++ b/tests/framework/instruments/OpenCLTimer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,9 +67,11 @@ private:
     };
     std::list<kernel_info>                          _kernels;
     std::function<decltype(clEnqueueNDRangeKernel)> _real_function;
-    std::function<decltype(graph::execute_task)>    _real_graph_function;
-    std::string                                     _prefix;
-    bool                                            _timer_enabled;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+    std::function<decltype(graph::execute_task)> _real_graph_function;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+    std::string _prefix;
+    bool        _timer_enabled;
 #endif /* ARM_COMPUTE_CL */
 
 private:
diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp
index c31cd42d19..b753485351 100644
--- a/tests/framework/instruments/SchedulerTimer.cpp
+++ b/tests/framework/instruments/SchedulerTimer.cpp
@@ -26,6 +26,7 @@
 #include "Instruments.h"
 #include "WallClockTimer.h"
 #include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/graph/DataLayerVisitor.h"
 #include "arm_compute/graph/INode.h"
 #include "support/Cast.h"
 
@@ -53,8 +54,10 @@ class Interceptor final : public IScheduler
 {
 public:
     /** Default constructor. */
-    Interceptor(std::list<struct SchedulerClock<output_timestamps>::kernel_info> &kernels, IScheduler &real_scheduler, ScaleFactor scale_factor)
-        : _kernels(kernels), _real_scheduler(real_scheduler), _timer(scale_factor), _prefix()
+    Interceptor(std::list<struct SchedulerClock<output_timestamps>::kernel_info> &kernels,
+                std::map<std::string, SchedulerTimer::LayerData> &layers, IScheduler &real_scheduler,
+                ScaleFactor scale_factor)
+        : _kernels(kernels), _layer_data_map(layers), _real_scheduler(real_scheduler), _timer(scale_factor), _prefix()
     {
     }
 
@@ -126,14 +129,24 @@ protected:
 
 private:
     std::list<struct SchedulerClock<output_timestamps>::kernel_info> &_kernels;
-    IScheduler                                                       &_real_scheduler;
-    WallClock<output_timestamps>                                      _timer;
-    std::string                                                       _prefix;
+    std::map<std::string, SchedulerTimer::LayerData> &_layer_data_map;
+    IScheduler                  &_real_scheduler;
+    WallClock<output_timestamps> _timer;
+    std::string                  _prefix;
 };
 
 template <bool output_timestamps>
 SchedulerClock<output_timestamps>::SchedulerClock(ScaleFactor scale_factor)
-    : _kernels(), _real_scheduler(nullptr), _real_scheduler_type(), _real_graph_function(nullptr), _scale_factor(scale_factor), _interceptor(nullptr), _scheduler_users()
+    : _kernels(),
+      _layer_data_map(),
+      _real_scheduler(nullptr),
+      _real_scheduler_type(),
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+      _real_graph_function(nullptr),
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+      _scale_factor(scale_factor),
+      _interceptor(nullptr),
+      _scheduler_users()
 {
     if(instruments_info != nullptr)
     {
@@ -144,6 +157,7 @@ SchedulerClock<output_timestamps>::SchedulerClock(ScaleFactor scale_factor)
 template <bool output_timestamps>
 void           SchedulerClock<output_timestamps>::test_start()
 {
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     // Start intercepting tasks:
     ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
     _real_graph_function  = graph::TaskExecutor::get().execute_function;
@@ -156,6 +170,13 @@ void           SchedulerClock<output_timestamps>::test_start()
             if(task.node != nullptr && !task.node->name().empty())
             {
                 scheduler->set_prefix(task.node->name() + "/");
+
+                if(_layer_data_map.find(task.node->name()) == _layer_data_map.end())
+                {
+                    arm_compute::graph::DataLayerVisitor dlv = {};
+                    task.node->accept(dlv);
+                    _layer_data_map[task.node->name()] = dlv.layer_data();
+                }
             }
             else
             {
@@ -170,6 +191,7 @@ void           SchedulerClock<output_timestamps>::test_start()
             scheduler->set_prefix("");
         }
     };
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 
     ARM_COMPUTE_ERROR_ON(_real_scheduler != nullptr);
     _real_scheduler_type = Scheduler::get_type();
@@ -177,9 +199,11 @@ void           SchedulerClock<output_timestamps>::test_start()
     if(_real_scheduler_type != Scheduler::Type::CUSTOM)
     {
         _real_scheduler = &Scheduler::get();
-        _interceptor    = std::make_shared<Interceptor<output_timestamps>>(_kernels, *_real_scheduler, _scale_factor);
+        _interceptor    = std::make_shared<Interceptor<output_timestamps>>(_kernels, _layer_data_map, *_real_scheduler, _scale_factor);
         Scheduler::set(std::static_pointer_cast<IScheduler>(_interceptor));
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
         graph::TaskExecutor::get().execute_function = task_interceptor;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 
         // Create an interceptor for each scheduler
         // TODO(COMPID-2638) : Allow multiple schedulers, now it assumes the same scheduler is used.
@@ -188,7 +212,7 @@ void           SchedulerClock<output_timestamps>::test_start()
         {
             if(user != nullptr && user->scheduler() != nullptr)
             {
-                user->intercept_scheduler(std::make_unique<Interceptor<output_timestamps>>(_kernels, *user->scheduler(), _scale_factor));
+                user->intercept_scheduler(std::make_unique<Interceptor<output_timestamps>>(_kernels, _layer_data_map, *user->scheduler(), _scale_factor));
             }
         });
     }
@@ -205,10 +229,12 @@ void           SchedulerClock<output_timestamps>::test_stop()
 {
     // Restore real scheduler
     Scheduler::set(_real_scheduler_type);
-    _real_scheduler                             = nullptr;
-    _interceptor                                = nullptr;
+    _real_scheduler = nullptr;
+    _interceptor    = nullptr;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     graph::TaskExecutor::get().execute_function = _real_graph_function;
     _real_graph_function                        = nullptr;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 
     // Restore schedulers
     std::for_each(std::begin(_scheduler_users), std::end(_scheduler_users),
@@ -257,6 +283,36 @@ Instrument::MeasurementsMap SchedulerClock<output_timestamps>::measurements() co
     return measurements;
 }
 
+template <bool output_timestamps>
+std::string    SchedulerClock<output_timestamps>::instrument_header() const
+{
+    // Emits one JSON member: "layer_data" : { "<layer>" : { "<key>" : "<value>", ... }, ... }
+    // Names, keys and values are copied verbatim (no JSON escaping is applied here)
+    std::string json = R"("layer_data" : {)";
+    bool        first_layer = true;
+    for(const auto &layer : _layer_data_map)
+    {
+        // Comma goes in front of every layer object except the first
+        if(!first_layer)
+        {
+            json += ",";
+        }
+        first_layer = false;
+        json += "\"" + layer.first + "\" : {";
+        bool first_entry = true;
+        for(const auto &entry : layer.second)
+        {
+            // Same comma rule for the key/value pairs inside a layer
+            json += first_entry ? "" : ",";
+            first_entry = false;
+            json += "\"" + entry.first + "\" : \"" + entry.second + "\"";
+        }
+        json += "}";
+    }
+    json += "}";
+    return json;
+}
+
 } // namespace framework
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/framework/instruments/SchedulerTimer.h b/tests/framework/instruments/SchedulerTimer.h
index aa948d32a6..c437f2717c 100644
--- a/tests/framework/instruments/SchedulerTimer.h
+++ b/tests/framework/instruments/SchedulerTimer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,7 @@ template <bool output_timestamps>
 class SchedulerClock : public Instrument
 {
 public:
+    using LayerData = std::map<std::string, std::string>;
     /** Construct a Scheduler timer.
      *
      * @param[in] scale_factor Measurement scale factor.
@@ -85,6 +86,7 @@ public:
     void                        start() override;
     void                        test_stop() override;
     Instrument::MeasurementsMap measurements() const override;
+    std::string                 instrument_header() const override;
 
     /** Kernel information */
     struct kernel_info
@@ -95,13 +97,16 @@ public:
     };
 
 private:
-    std::list<kernel_info>                       _kernels;
-    IScheduler                                  *_real_scheduler;
-    Scheduler::Type                              _real_scheduler_type;
+    std::list<kernel_info> _kernels;
+    std::map<std::string, LayerData> _layer_data_map;
+    IScheduler     *_real_scheduler;
+    Scheduler::Type _real_scheduler_type;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     std::function<decltype(graph::execute_task)> _real_graph_function;
-    ScaleFactor                                  _scale_factor;
-    std::shared_ptr<IScheduler>                  _interceptor;
-    std::vector<ISchedulerUser *>                _scheduler_users;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+    ScaleFactor                   _scale_factor;
+    std::shared_ptr<IScheduler>   _interceptor;
+    std::vector<ISchedulerUser *> _scheduler_users;
 };
 
 using SchedulerTimer      = SchedulerClock<false>;
diff --git a/tests/framework/instruments/hwc_names.hpp b/tests/framework/instruments/hwc_names.hpp
index e68bcbed82..c39f3bba7a 100644
--- a/tests/framework/instruments/hwc_names.hpp
+++ b/tests/framework/instruments/hwc_names.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,7 @@ enum
 /*
      * "Short names" for hardware counters used by Streamline. Counters names are
      * stored in accordance with their memory layout in the binary counter block
-     * emitted by the Mali GPU. Each "master" in the GPU emits a fixed-size block
+     * emitted by the Arm® Mali™ GPU. Each "master" in the GPU emits a fixed-size block
      * of 64 counters, and each GPU implements the same set of "masters" although
      * the counters each master exposes within its block of 64 may vary.
      *
diff --git a/tests/framework/printers/JSONPrinter.cpp b/tests/framework/printers/JSONPrinter.cpp
index 0995ff3594..84b2d23114 100644
--- a/tests/framework/printers/JSONPrinter.cpp
+++ b/tests/framework/printers/JSONPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -169,10 +169,18 @@ void JSONPrinter::print_info(const std::string &info)
     _infos.push_back(info);
 }
 
+void JSONPrinter::print_profiler_header(const std::string &header_data)
+{
+    if(!header_data.empty())
+    {
+        print_separator(_first_test_entry); // same separator call print_measurements() issues before its output
+    }
+    *_stream << header_data;
+}
+
 void JSONPrinter::print_measurements(const Profiler::MeasurementsMap &measurements)
 {
     print_separator(_first_test_entry);
-
     *_stream << R"("measurements" : {)";
 
     for(auto i_it = measurements.cbegin(), i_end = measurements.cend(); i_it != i_end;)
diff --git a/tests/framework/printers/JSONPrinter.h b/tests/framework/printers/JSONPrinter.h
index ce587ade04..ad996708e7 100644
--- a/tests/framework/printers/JSONPrinter.h
+++ b/tests/framework/printers/JSONPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,6 +51,7 @@ public:
     void print_errors_footer() override;
     void print_error(const std::exception &error, bool expected) override;
     void print_info(const std::string &info) override;
+    void print_profiler_header(const std::string &header_data) override;
     void print_measurements(const Profiler::MeasurementsMap &measurements) override;
     void print_list_tests(const std::vector<TestInfo> &infos) override;
 
diff --git a/tests/framework/printers/PrettyPrinter.cpp b/tests/framework/printers/PrettyPrinter.cpp
index aa06eb9b4e..529ff2c2d9 100644
--- a/tests/framework/printers/PrettyPrinter.cpp
+++ b/tests/framework/printers/PrettyPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -116,6 +116,12 @@ void PrettyPrinter::print_list_tests(const std::vector<TestInfo> &infos)
         *_stream << "[" << info.id << ", " << info.mode << ", " << info.status << "] " << info.name << "\n";
     }
 }
+
+void PrettyPrinter::print_profiler_header(const std::string &header_data)
+{
+    ARM_COMPUTE_UNUSED(header_data); // header is deliberately ignored: nothing is written to the stream
+}
+
 void PrettyPrinter::print_measurements(const Profiler::MeasurementsMap &measurements)
 {
     for(const auto &instrument : measurements)
diff --git a/tests/framework/printers/PrettyPrinter.h b/tests/framework/printers/PrettyPrinter.h
index ded0da04d8..b9d5d3957c 100644
--- a/tests/framework/printers/PrettyPrinter.h
+++ b/tests/framework/printers/PrettyPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,7 @@ public:
     void print_errors_footer() override;
     void print_error(const std::exception &error, bool expected) override;
     void print_info(const std::string &info) override;
+    void print_profiler_header(const std::string &header_data) override;
     void print_measurements(const Profiler::MeasurementsMap &measurements) override;
     void print_list_tests(const std::vector<TestInfo> &infos) override;
 
diff --git a/tests/framework/printers/Printer.h b/tests/framework/printers/Printer.h
index 669b7f6a95..af0209788c 100644
--- a/tests/framework/printers/Printer.h
+++ b/tests/framework/printers/Printer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -125,6 +125,12 @@ public:
      */
     virtual void print_info(const std::string &info) = 0;
 
+    /** Print header data.
+     *
+     * @param[in] header_data JSON formatted header data.
+     */
+    virtual void print_profiler_header(const std::string &header_data) = 0;
+
     /** Print measurements for a test.
      *
      * @param[in] measurements Measurements as collected by a @ref Profiler.
diff --git a/tests/main.cpp b/tests/main.cpp
index cecd89bdb7..e862c7627e 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/Version.h"
 #include "support/StringSupport.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/framework/DatasetModes.h"
@@ -45,6 +46,7 @@
 #include "utils/TypePrinter.h"
 #endif /* ARM_COMPUTE_CL */
 #include "arm_compute/runtime/Scheduler.h"
+#include "src/common/cpuinfo/CpuModel.h"
 
 #include <fstream>
 #include <initializer_list>
@@ -115,7 +117,7 @@ int main(int argc, char **argv)
     auto filter_id = parser.add_option<utils::SimpleOption<std::string>>("filter-id");
     filter_id->set_help("List of test ids. ... can be used to define a range.");
     auto stop_on_error = parser.add_option<utils::ToggleOption>("stop-on-error");
-    stop_on_error->set_help("Abort execution after the first failed test (useful for debugging)");
+    stop_on_error->set_help("Stop execution after the first failed test (useful for debugging)");
     auto seed = parser.add_option<utils::SimpleOption<std::random_device::result_type>>("seed", std::random_device()());
     seed->set_help("Global seed for random number generation");
     auto list_tests = parser.add_option<utils::ToggleOption>("list-tests", false);
@@ -126,6 +128,8 @@ int main(int argc, char **argv)
     error_on_missing_assets->set_help("Mark a test as failed instead of skipping it when assets are missing");
     auto assets = parser.add_positional_option<utils::SimpleOption<std::string>>("assets");
     assets->set_help("Path to the assets directory");
+    auto print_rerun_command = parser.add_option<utils::ToggleOption>("rerun-cmd");
+    print_rerun_command->set_help("Print out the command to rerun the exact failed testcase");
 #ifdef ARM_COMPUTE_CL
     auto enable_tuner = parser.add_option<utils::ToggleOption>("enable-tuner");
     enable_tuner->set_help("Enable OpenCL dynamic tuner");
@@ -183,7 +187,7 @@ int main(int argc, char **argv)
         CLGEMMHeuristicsHandle gemm_heuristics;
         if(opencl_is_available())
         {
-            auto ctx_dev_err = create_opencl_context_and_device();
+            auto ctx_dev_err = create_opencl_context_and_device(CLBackendType::Native);
             ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
             gemm_heuristics.reload_from_file(mlgo_file->value());
             CLScheduler::get().default_init_with_context(std::get<1>(ctx_dev_err), std::get<0>(ctx_dev_err), &cl_tuner, &gemm_heuristics);
@@ -235,13 +239,23 @@ int main(int argc, char **argv)
 #endif /* ARM_COMPUTE_CL */
                 const arm_compute::CPUInfo &cpu_info = Scheduler::get().cpu_info();
                 const unsigned int          num_cpus = cpu_info.get_cpu_num();
+
+                p->print_entry("cpu_has_sve", support::cpp11::to_string(cpu_info.has_sve()));
+                p->print_entry("cpu_has_sve2", support::cpp11::to_string(cpu_info.has_sve2()));
+                p->print_entry("cpu_has_svef32mm", support::cpp11::to_string(cpu_info.has_svef32mm()));
+                p->print_entry("cpu_has_svei8mm", support::cpp11::to_string(cpu_info.has_svei8mm()));
+                p->print_entry("cpu_has_svebf16", support::cpp11::to_string(cpu_info.has_svebf16()));
+                p->print_entry("cpu_has_sme", support::cpp11::to_string(cpu_info.has_sme()));
+                p->print_entry("cpu_has_sme2", support::cpp11::to_string(cpu_info.has_sme2()));
                 p->print_entry("cpu_has_fp16", support::cpp11::to_string(cpu_info.has_fp16()));
+                p->print_entry("cpu_has_bf16", support::cpp11::to_string(cpu_info.has_bf16()));
                 p->print_entry("cpu_has_dotprod", support::cpp11::to_string(cpu_info.has_dotprod()));
+                p->print_entry("cpu_has_i8mm", support::cpp11::to_string(cpu_info.has_i8mm()));
 
                 for(unsigned int j = 0; j < num_cpus; ++j)
                 {
                     const CPUModel model = cpu_info.get_cpu_model(j);
-                    p->print_entry("CPU" + support::cpp11::to_string(j), cpu_model_to_string(model));
+                    p->print_entry("CPU" + support::cpp11::to_string(j), cpuinfo::cpu_model_to_string(model));
                 }
                 p->print_entry("Iterations", support::cpp11::to_string(options.iterations->value()));
                 p->print_entry("Threads", support::cpp11::to_string(threads->value()));
@@ -259,14 +273,16 @@ int main(int argc, char **argv)
 
         // Initialize framework
         framework::FrameworkConfig fconfig;
-        fconfig.instruments    = options.instruments->value();
-        fconfig.name_filter    = filter->value();
-        fconfig.id_filter      = filter_id->value();
-        fconfig.num_iterations = options.iterations->value();
-        fconfig.mode           = dataset_mode->value();
-        fconfig.log_level      = options.log_level->value();
-        fconfig.cooldown_sec   = cooldown_sec->value();
-        fconfig.configure_only = configure_only->value();
+        fconfig.instruments     = options.instruments->value();
+        fconfig.name_filter     = filter->value();
+        fconfig.id_filter       = filter_id->value();
+        fconfig.num_iterations  = options.iterations->value();
+        fconfig.mode            = dataset_mode->value();
+        fconfig.log_level       = options.log_level->value();
+        fconfig.cooldown_sec    = cooldown_sec->value();
+        fconfig.configure_only  = configure_only->value();
+        fconfig.print_rerun_cmd = print_rerun_command->value();
+        fconfig.seed            = seed->value();
         framework.init(fconfig);
 
         for(auto &p : printers)
diff --git a/tests/validate_examples/RunExample.cpp b/tests/validate_examples/RunExample.cpp
index 8ba3c2b295..36bf587551 100644
--- a/tests/validate_examples/RunExample.cpp
+++ b/tests/validate_examples/RunExample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "ValidateExample.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/Scheduler.h"
+#include "arm_compute/core/Version.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/framework/Framework.h"
@@ -150,7 +151,16 @@ int run_example(int argc, char **argv, std::unique_ptr<ValidateExample> example)
 #ifdef ARM_COMPUTE_CL
     if(opencl_is_available())
     {
-        auto ctx_dev_err = create_opencl_context_and_device();
+        CLBackendType backend_type = CLBackendType::Native;
+        for(auto &arg : example_args->value())
+        {
+            if(arg.find("--target=clvk") != std::string::npos)
+            {
+                backend_type = CLBackendType::Clvk;
+                break;
+            }
+        }
+        auto ctx_dev_err = create_opencl_context_and_device(backend_type);
         ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
         CLScheduler::get().default_init_with_context(std::get<1>(ctx_dev_err), std::get<0>(ctx_dev_err), nullptr);
     }
diff --git a/tests/validate_examples/cl_gemm.cpp b/tests/validate_examples/cl_gemm.cpp
index 717ba77e17..8189b228c2 100644
--- a/tests/validate_examples/cl_gemm.cpp
+++ b/tests/validate_examples/cl_gemm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,20 +32,19 @@
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
@@ -205,7 +204,11 @@ public:
             mm_gemmlowp.configure(&src0, &src1, nullptr, &tmp_dst);
 
             // Configure GEMMlowp output stage
-            mm_gemmlowp_output_stage.configure(&tmp_dst, add_bias ? &biases : nullptr, &dst, dst_multiplier, dst_shift, offset_dst);
+            GEMMLowpOutputStageInfo gemm_info{};
+            gemm_info.gemmlowp_multiplier = dst_multiplier;
+            gemm_info.gemmlowp_shift      = dst_shift;
+            gemm_info.gemmlowp_offset     = offset_dst;
+            mm_gemmlowp_output_stage.configure(&tmp_dst, add_bias ? &biases : nullptr, &dst, gemm_info);
             tmp_dst.allocator()->allocate();
             biases.allocator()->allocate();
             fill(CLAccessor(biases), 3);
@@ -393,9 +396,9 @@ private:
     CLTensor src0{}, src1{}, src2{}, dst{};
     CLTensor tmp_dst{}, biases{};
 
-    CLGEMM                                              mm_gemm{};
-    CLGEMMLowpMatrixMultiplyCore                        mm_gemmlowp{};
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint mm_gemmlowp_output_stage{};
+    CLGEMM                       mm_gemm{};
+    CLGEMMLowpMatrixMultiplyCore mm_gemmlowp{};
+    CLGEMMLowpOutputStage        mm_gemmlowp_output_stage{};
 
     size_t   M{ 7 }, N{ 3 }, K{ 5 }, B{ 1 };
     DataType data_type{ DataType::F32 };
diff --git a/tests/validate_examples/graph_validate_utils.h b/tests/validate_examples/graph_validate_utils.h
index f6f47cc2c3..c1a83d1f40 100644
--- a/tests/validate_examples/graph_validate_utils.h
+++ b/tests/validate_examples/graph_validate_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,8 @@
  * SOFTWARE.
  */
 
-#ifndef GRAPH_VALIDATE_UTILS_H
-#define GRAPH_VALIDATE_UTILS_H
+#ifndef ACL_TESTS_VALIDATE_EXAMPLES_GRAPH_VALIDATE_UTILS_H
+#define ACL_TESTS_VALIDATE_EXAMPLES_GRAPH_VALIDATE_UTILS_H
 
 #include "arm_compute/graph.h"
 
@@ -240,7 +240,6 @@ public:
         {
             arm_compute::graph::Target::NEON,
             arm_compute::graph::Target::CL,
-            arm_compute::graph::Target::GC,
         };
 
         const std::set<arm_compute::DataType> supported_data_types
@@ -288,7 +287,6 @@ public:
      * @param[out] os            Output stream.
      * @param[in]  common_params Example parameters to output
      *
-     * @return None.
      */
     virtual void print_parameters(::std::ostream &os, const ExampleParams &common_params)
     {
@@ -310,8 +308,6 @@ public:
  *
  * @param[in]  options       Options to consume
  * @param[out] common_params params structure to consume.
- *
- * @return consume_common_graph_parameters structure containing the common graph parameters
  */
 void consume_common_graph_parameters(CommonGraphValidateOptions &options, CommonParams &common_params)
 {
@@ -397,7 +393,6 @@ public:
      * @param[out] bias    The tensor with the bias data.
      * @param[in]  tensor  Tensor result of the actual operation passed into the Accessor.
      *
-     * @return None.
      */
     virtual void create_tensors(arm_compute::test::SimpleTensor<D>     &src,
                                 arm_compute::test::SimpleTensor<D>     &weights,
@@ -446,8 +441,6 @@ public:
      * @param[in]  seed   seed for the randomization function
      * @param[in]  low    lower bound for random values
      * @param[in]  high   upper bound for random values
-     *
-     * @return None.
      */
     void fill_tensor(arm_compute::test::SimpleTensor<uint8_t> &tensor, std::random_device::result_type seed, uint8_t low, uint8_t high)
     {
@@ -474,8 +467,6 @@ public:
      * @param[in]  seed   seed for the randomization function
      * @param[in]  low    lower bound for random values
      * @param[in]  high   upper bound for random values
-     *
-     * @return None.
      */
     void fill_tensor(arm_compute::test::SimpleTensor<int32_t> &tensor, std::random_device::result_type seed, int32_t low, int32_t high)
     {
@@ -495,8 +486,6 @@ public:
      * @param[in]  seed   seed for the randomization function
      * @param[in]  low    lower bound for random values
      * @param[in]  high   upper bound for random values
-     *
-     * @return None.
      */
     void fill_tensor(arm_compute::test::SimpleTensor<float> &tensor, std::random_device::result_type seed, float low, float high)
     {
@@ -517,8 +506,6 @@ public:
      * @param[in]  seed   seed for the randomization function
      * @param[in]  low    lower bound for random values
      * @param[in]  high   upper bound for random values
-     *
-     * @return None.
      */
     void fill_tensor(arm_compute::test::SimpleTensor<half> &tensor, std::random_device::result_type seed, half low, half high)
     {
@@ -560,8 +547,6 @@ public:
      *
      * @param[in] tensor Tensor result of the actual operation passed into the Accessor.
      * @param[in] output Tensor result of the reference implementation.
-     *
-     * @return None.
      */
     void validate(ITensor &tensor, arm_compute::test::SimpleTensor<D> output)
     {
@@ -693,4 +678,4 @@ public:
 
 } // graph_validate_utils
 } // arm_compute
-#endif //GRAPH_VALIDATE_UTILS_H
+#endif // ACL_TESTS_VALIDATE_EXAMPLES_GRAPH_VALIDATE_UTILS_H
diff --git a/tests/validation/CL/AbsLayer.cpp b/tests/validation/CL/AbsLayer.cpp
index e6ba14b50e..0bad8f9b68 100644
--- a/tests/validation/CL/AbsLayer.cpp
+++ b/tests/validation/CL/AbsLayer.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/ActivationLayer.cpp b/tests/validation/CL/ActivationLayer.cpp
index fa95594157..133b39d154 100644
--- a/tests/validation/CL/ActivationLayer.cpp
+++ b/tests/validation/CL/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,7 @@ AbsoluteTolerance<float> tolerance(ActivationLayerInfo::ActivationFunction activ
         case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
         case ActivationLayerInfo::ActivationFunction::ELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
+        case ActivationLayerInfo::ActivationFunction::GELU:
             return AbsoluteTolerance<float>(data_type == DataType::F16 ? 0.01f : 0.00001f);
         case ActivationLayerInfo::ActivationFunction::TANH:
             return AbsoluteTolerance<float>(data_type == DataType::F16 ? 0.001f : 0.00001f);
diff --git a/tests/validation/CL/ArgMinMax.cpp b/tests/validation/CL/ArgMinMax.cpp
index 1d849ed0c7..8566972f81 100644
--- a/tests/validation/CL/ArgMinMax.cpp
+++ b/tests/validation/CL/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,15 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
-#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/ShapeDatasets.h"
-#include "tests/datasets/SplitDataset.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ArgMinMaxFixture.h"
@@ -46,6 +42,8 @@ namespace
 const auto ArgMinMaxSmallDataset = framework::dataset::make("Shape",
 {
     TensorShape{ 1U, 7U, 1U, 3U },
+    TensorShape{ 3U, 1U, 3U, 2U },
+    TensorShape{ 2U, 1U, 3U, 2U },
     TensorShape{ 149U, 5U, 1U, 2U },
     TensorShape{ 166U, 5U, 1U, 2U },
     TensorShape{ 322U, 5U, 1U, 2U },
@@ -53,6 +51,22 @@ const auto ArgMinMaxSmallDataset = framework::dataset::make("Shape",
     TensorShape{ 2560, 2U, 2U, 2U },
 });
 
+const auto ArgMinMaxSmallDatasetAxis0 = framework::dataset::make("Shape", // 1D and 2D shapes only
+{
+    TensorShape{ 1U, 5U },
+    TensorShape{ 2U, 3U },
+    TensorShape{ 1U },
+    TensorShape{ 3U },
+    TensorShape{ 2U },
+    TensorShape{ 5U },
+    TensorShape{ 17U },
+    TensorShape{ 15U, 2U },
+});
+
+const auto OpsDataset   = framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX });
+const auto AxisDataset  = framework::dataset::make("Axis", { 0, 1, 2, 3 });
+const auto QInfoDataset = framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) });
+
 const auto ArgMinMaxLargeDataset = framework::dataset::make("Shape",
 { TensorShape{ 517U, 123U, 13U, 2U } });
 } // namespace
@@ -85,47 +99,78 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-template <typename T>
-using CLArgMinMaxValidationFixture = ArgMinMaxValidationFixture<CLTensor, CLAccessor, CLArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using CLArgMinMaxValidationFixture = ArgMinMaxValidationFixture<CLTensor, CLAccessor, CLArgMinMaxLayer, T1, T2>;
+
+using CLArgMinMaxValidationFixture_S32_S32 = CLArgMinMaxValidationFixture<int32_t, int32_t>;
+using CLArgMinMaxValidationFixture_F16_S32 = CLArgMinMaxValidationFixture<half, int32_t>;
+using CLArgMinMaxValidationFixture_F32_S32 = CLArgMinMaxValidationFixture<float, int32_t>;
+using CLArgMinMaxValidationFixture_F32_S64 = CLArgMinMaxValidationFixture<float, int64_t>;
 
 TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmallAxis0,
+                       CLArgMinMaxValidationFixture_S32_S32,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDatasetAxis0,
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       framework::dataset::make("Axis", { 0 })), // only axis 0 is exercised for these shapes
+                               OpsDataset))
+{
+    // Validate output: compare the CL target tensor with the fixture's reference result
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       CLArgMinMaxValidationFixture<int32_t>,
+                       CLArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(ArgMinMaxSmallDataset, framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                               framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       CLArgMinMaxValidationFixture<int32_t>,
+                       CLArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(ArgMinMaxLargeDataset, framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                               framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+
 TEST_SUITE_END() // S32
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       CLArgMinMaxValidationFixture<half>,
+                       CLArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(ArgMinMaxSmallDataset, framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                               framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       CLArgMinMaxValidationFixture<half>,
+                       CLArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(ArgMinMaxLargeDataset, framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                               framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -134,49 +179,77 @@ TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       CLArgMinMaxValidationFixture<float>,
+                       CLArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(ArgMinMaxSmallDataset, framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                               framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall_F32_S64,
+                       CLArgMinMaxValidationFixture_F32_S64,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S64)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       CLArgMinMaxValidationFixture<float>,
+                       CLArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(ArgMinMaxLargeDataset, framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                               framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset,
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
-template <typename T>
-using CLArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<CLTensor, CLAccessor, CLArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using CLArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<CLTensor, CLAccessor, CLArgMinMaxLayer, T1, T2>;
+
+using CLArgMinMaxQuantizedValidationFixture_U8_S32 = CLArgMinMaxQuantizedValidationFixture<uint8_t, int32_t>;
+using CLArgMinMaxQuantizedValidationFixture_S8_S32 = CLArgMinMaxQuantizedValidationFixture<int8_t, int32_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       CLArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       CLArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(ArgMinMaxSmallDataset, framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset,
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       CLArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       CLArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(ArgMinMaxLargeDataset, framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset,
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -185,28 +258,32 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       CLArgMinMaxQuantizedValidationFixture<int8_t>,
+                       CLArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(ArgMinMaxSmallDataset, framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset,
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       CLArgMinMaxQuantizedValidationFixture<int8_t>,
+                       CLArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(ArgMinMaxLargeDataset, framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset,
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
-
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // ArgMinMax
 TEST_SUITE_END() // CL
diff --git a/tests/validation/CL/ArithmeticAddition.cpp b/tests/validation/CL/ArithmeticAddition.cpp
index c74f6a3b23..1ed3a105dc 100644
--- a/tests/validation/CL/ArithmeticAddition.cpp
+++ b/tests/validation/CL/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,26 +41,12 @@ namespace test
 {
 namespace validation
 {
+/** Kept in sync with tests/validation/dynamic_fusion/gpu/cl/Add.cpp from the dynamic fusion interface.
+ * Please check that file for any differences in test coverage.
+ */
 namespace
 {
 /** Input data sets **/
-const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto ArithmeticAdditionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                      framework::dataset::make("DataType",
-                                                                               DataType::QASYMM8));
-const auto ArithmeticAdditionQASYMM8SignedDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                            framework::dataset::make("DataType",
-                                                                                     DataType::QASYMM8_SIGNED));
-const auto ArithmeticAdditionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                      framework::dataset::make("DataType",
-                                                                               DataType::QSYMM16));
-const auto ArithmeticAdditionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-const auto ArithmeticAdditionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
-                                                   framework::dataset::make("DataType", DataType::F16));
-const auto ArithmeticAdditionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
-                                                   framework::dataset::make("DataType", DataType::F32));
 const auto EmptyActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 { ActivationLayerInfo() });
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
@@ -68,6 +54,8 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(CL)
@@ -76,22 +64,19 @@ TEST_SUITE(ArithmeticAddition)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
-               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false})),
+               framework::dataset::make("Expected", { true, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), ConvertPolicy::WRAP)) == expected, framework::LogLevel::ERRORS);
@@ -129,8 +114,10 @@ using CLArithmeticAdditionFixture = ArithmeticAdditionValidationFixture<CLTensor
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::U8)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -138,15 +125,19 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<uint8_t>, framework
 TEST_SUITE_END() // U8
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::S16)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::S16)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -159,48 +150,65 @@ using CLArithmeticAdditionQuantizedFixture = ArithmeticAdditionValidationQuantiz
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       ArithmeticAdditionQASYMM8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 template <typename T>
 using CLArithmeticAdditionBroadcastQuantizedFixture = ArithmeticAdditionValidationQuantizedBroadcastFixture<CLTensor, CLAccessor, CLArithmeticAddition, T>;
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticAdditionBroadcastQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticAdditionQASYMM8Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticAdditionBroadcastQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, CLArithmeticAdditionBroadcastQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       ArithmeticAdditionQASYMM8SignedDataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 10) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE(QSYMM16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       ArithmeticAdditionQSYMM16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QSYMM16)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -213,16 +221,21 @@ using CLArithmeticAdditionFloatFixture = ArithmeticAdditionValidationFloatFixtur
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                      DataType::F16)),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                              EmptyActivationFunctionsDataset))
+                                                                                                                      EmptyActivationFunctionsDataset),
+                                                                                                              OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticAdditionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ArithmeticAdditionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticAdditionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapes(),
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                               DataType::F16)),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       ActivationFunctionsDataset))
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -230,24 +243,32 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticAdditionFloatFixture<half>
 TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::F32)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     EmptyActivationFunctionsDataset))
+                                                                                                                     EmptyActivationFunctionsDataset),
+                                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ArithmeticAdditionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapes(),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F32)),
                                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                        ActivationFunctionsDataset))
+                                                                                                                        ActivationFunctionsDataset),
+                                                                                                                        OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticAdditionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::F32)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                   EmptyActivationFunctionsDataset))
+                                                                                                                   EmptyActivationFunctionsDataset),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -256,27 +277,30 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticAdditionFloatFixture<float>, framew
 template <typename T>
 using CLArithmeticAdditionBroadcastFloatFixture = ArithmeticAdditionBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLArithmeticAddition, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticAdditionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticAdditionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticAdditionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticAdditionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticAdditionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticAdditionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/ArithmeticDivision.cpp b/tests/validation/CL/ArithmeticDivision.cpp
index 36567dc02a..94bacba7e5 100644
--- a/tests/validation/CL/ArithmeticDivision.cpp
+++ b/tests/validation/CL/ArithmeticDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,13 +51,16 @@ const auto ArithmeticDivisionFP16Dataset = combine(combine(framework::dataset::m
                                                    framework::dataset::make("DataType", DataType::F16));
 const auto ArithmeticDivisionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                    framework::dataset::make("DataType", DataType::F32));
-const auto EmptyActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
-{ ActivationLayerInfo() });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ArithmeticDivisionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S32)),
+                                                  framework::dataset::make("DataType", DataType::S32)); // all three DataType entries are S32
+const auto EmptyActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { ActivationLayerInfo() }); // a single default-constructed ActivationLayerInfo
+const auto ActivationFunctionsDataset      = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true }); // InPlace flag takes both false and true
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); // InPlace flag is always false
 } // namespace
 
 TEST_SUITE(CL)
@@ -89,19 +92,44 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+using CLArithmeticDivisionIntegerFixture = ArithmeticDivisionValidationIntegerFixture<CLTensor, CLAccessor, CLArithmeticDivision, int>; // integer fixture bound to CLTensor/CLAccessor/CLArithmeticDivision with element type int
+
+TEST_SUITE(Integer)
+TEST_SUITE(S32)
+
+FIXTURE_DATA_TEST_CASE(RunSmallInteger, CLArithmeticDivisionIntegerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticDivisionS32Dataset),
+                                                                                                                       EmptyActivationFunctionsDataset),
+                                                                                                                       InPlaceDataSet))
+{
+    // Validate the target output (read through CLAccessor) against _reference
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunIntegerWithActivation, CLArithmeticDivisionIntegerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticDivisionS32Dataset),
+                       ActivationFunctionsDataset),
+                       InPlaceDataSet))
+{
+    // Validate the target output (read through CLAccessor) against _reference
+    validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END() // S32
+TEST_SUITE_END() // Integer
+
 template <typename T>
 using CLArithmeticDivisionFloatFixture = ArithmeticDivisionValidationFloatFixture<CLTensor, CLAccessor, CLArithmeticDivision, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticDivisionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticDivisionFP16Dataset),
-                                                                                                              EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticDivisionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticDivisionFP16Dataset),
+                                                                                                                      EmptyActivationFunctionsDataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticDivisionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ArithmeticDivisionFP16Dataset),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticDivisionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ArithmeticDivisionFP16Dataset),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
@@ -109,21 +137,24 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticDivisionFloatFixture<half>
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticDivisionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticDivisionFP32Dataset),
-                                                                                                                     EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticDivisionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticDivisionFP32Dataset),
+                                                                                                                     EmptyActivationFunctionsDataset),
+                                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticDivisionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ArithmeticDivisionFP32Dataset),
-                                                                                                                        ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticDivisionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ArithmeticDivisionFP32Dataset),
+                                                                                                                        ActivationFunctionsDataset),
+                                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticDivisionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticDivisionFP32Dataset),
-                                                                                                                   EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticDivisionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticDivisionFP32Dataset),
+                                                                                                                   EmptyActivationFunctionsDataset),
+                                                                                                                   InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -132,24 +163,27 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticDivisionFloatFixture<float>, framew
 template <typename T>
 using CLArithmeticDivisionBroadcastFloatFixture = ArithmeticDivisionBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLArithmeticDivision, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticDivisionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticDivisionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ArithmeticDivisionFP32Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticDivisionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticDivisionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ArithmeticDivisionFP32Dataset),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticDivisionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticDivisionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
                        ArithmeticDivisionFP32Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/CL/ArithmeticSubtraction.cpp b/tests/validation/CL/ArithmeticSubtraction.cpp
index 2709fcaedb..5825ce2e5d 100644
--- a/tests/validation/CL/ArithmeticSubtraction.cpp
+++ b/tests/validation/CL/ArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,27 +41,12 @@ namespace test
 {
 namespace validation
 {
+/** Kept in sync with tests/validation/dynamic_fusion/gpu/cl/Sub.cpp from the dynamic fusion interface.
+ * Please check that file for any differences in coverage.
+ */
 namespace
 {
 /** Input data sets **/
-const auto ArithmeticSubtractionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)),
-                                                    framework::dataset::make("DataType",
-                                                                             DataType::U8));
-const auto ArithmeticSubtractionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                         framework::dataset::make("DataType",
-                                                                                  DataType::QASYMM8));
-const auto ArithmeticSubtractionQASYMM8SignedDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED));
-const auto ArithmeticSubtractionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                         framework::dataset::make("DataType",
-                                                                                  DataType::QSYMM16));
-const auto ArithmeticSubtractionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                     framework::dataset::make("DataType", DataType::S16));
-const auto ArithmeticSubtractionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
-                                                      framework::dataset::make("DataType", DataType::F16));
-const auto ArithmeticSubtractionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
-                                                      framework::dataset::make("DataType", DataType::F32));
 const auto EmptyActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 { ActivationLayerInfo() });
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
@@ -79,22 +64,19 @@ TEST_SUITE(ArithmeticSubtraction)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
-               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false})),
+               framework::dataset::make("Expected", { true, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLArithmeticSubtraction::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), ConvertPolicy::WRAP)) == expected, framework::LogLevel::ERRORS);
@@ -159,7 +141,8 @@ using CLArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture<CL
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                       DataType::U8)),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                OutOfPlaceDataSet))
 {
@@ -169,7 +152,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<uint8_t>, framew
 TEST_SUITE_END() // U8
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S16)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -177,7 +161,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<int16_t>, framew
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S16)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    OutOfPlaceDataSet))
 {
@@ -193,11 +178,22 @@ using CLArithmeticSubtractionQuantizedFixture = ArithmeticSubtractionValidationQ
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       ArithmeticSubtractionQASYMM8Dataset),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
                        framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyInPlace, CLArithmeticSubtractionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::TinyShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                       framework::dataset::make("Src1QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        InPlaceDataSet))
 {
     // Validate output
@@ -206,12 +202,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<uint8_t
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       ArithmeticSubtractionQASYMM8SignedDataset),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 10) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
                        framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
-                       InPlaceDataSet))
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -219,7 +215,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int8_t>
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       ArithmeticSubtractionQSYMM16Dataset),
+                       framework::dataset::make("DataType", DataType::QSYMM16)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
@@ -237,7 +233,8 @@ using CLArithmeticSubtractionFloatFixture = ArithmeticSubtractionValidationFloat
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F16)),
                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                  EmptyActivationFunctionsDataset),
                                                                                                                  OutOfPlaceDataSet))
@@ -246,7 +243,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<half>, fram
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapes(),
-                       ArithmeticSubtractionFP16Dataset),
+                       framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        ActivationFunctionsDataset),
                        InPlaceDataSet))
@@ -258,7 +255,7 @@ TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                        ArithmeticSubtractionFP32Dataset),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
                                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                         EmptyActivationFunctionsDataset),
                                                                                                                         OutOfPlaceDataSet))
@@ -267,7 +264,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<float>, fra
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapes(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        ActivationFunctionsDataset),
                        InPlaceDataSet))
@@ -277,7 +274,7 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<fl
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                      ArithmeticSubtractionFP32Dataset),
+                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                       EmptyActivationFunctionsDataset),
                                                                                                                       OutOfPlaceDataSet))
@@ -290,7 +287,7 @@ template <typename T>
 using CLArithmeticSubtractionBroadcastFloatFixture = ArithmeticSubtractionBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLArithmeticSubtraction, T>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        EmptyActivationFunctionsDataset),
                        OutOfPlaceDataSet))
@@ -298,8 +295,18 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticSubtractionBroadcastFloatF
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInplace, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                       framework::dataset::make("DataType", DataType::F32)),
+                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                       EmptyActivationFunctionsDataset),
+                               InPlaceDataSet))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
 FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        ActivationFunctionsDataset),
                        OutOfPlaceDataSet))
@@ -309,7 +316,7 @@ FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticSubtractionBroadc
 }
 
 FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        EmptyActivationFunctionsDataset),
                        OutOfPlaceDataSet))
diff --git a/tests/validation/CL/BatchNormalizationLayer.cpp b/tests/validation/CL/BatchNormalizationLayer.cpp
index 8b3bdbc3ea..3b87b9d1b5 100644
--- a/tests/validation/CL/BatchNormalizationLayer.cpp
+++ b/tests/validation/CL/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,7 @@ namespace
 {
 RelativeTolerance<float>           rel_tolerance_f32(0.05f);   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<float> abs_tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);       /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> tolerance_f16(0.02f);       /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 const auto                         act_infos = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
diff --git a/tests/validation/CL/BatchToSpaceLayer.cpp b/tests/validation/CL/BatchToSpaceLayer.cpp
index e90ac921c5..ca12b76e8a 100644
--- a/tests/validation/CL/BatchToSpaceLayer.cpp
+++ b/tests/validation/CL/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,56 +50,38 @@ using CLBatchToSpaceLayerFixture = BatchToSpaceLayerValidationFixture<CLTensor,
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Mismatching data types
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Wrong data type block shape
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U, 4U), 1, DataType::F32), // Wrong tensor shape
-                                                     }),
-               framework::dataset::make("BlockShapeInfo",{ TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 4U), 1, DataType::S32),
-                                                      TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                     })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
-
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                     })),
-               framework::dataset::make("Expected", { true, true,true, false, false, false})),
-               input_info, block_shape_info, output_info, expected)
-{
-    bool has_error = bool(CLBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), &block_shape_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false)));
-    ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
-}
-DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(
+DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Negative block shapes
-                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32), // Wrong tensor shape
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blockx > blocky
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blocky > blockx
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Mismatching data types
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Negative block shapes
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32),// Invalid: Unsupported tensor rank
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid: Output tensor shape has wrong batch dimension
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid: Output tensor shape has wrong spatial dimensions
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: Correct output tensor shape with cropping
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid: Wrong output tensor shape with cropping
                                                      }),
-               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2 })),
-               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2 })),
+               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("CropInfo", {
+                CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{3, 2, 1, 3}, CropInfo{3, 2, 1, 3}
+               })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 16U, 2U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 32U, 2U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 8U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(33U, 32U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(27, 12U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true,true, false, false, false})),
-               input_info, block_shape_x, block_shape_y, output_info, expected)
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true, false})),
+               input_info, block_shape_x, block_shape_y, crop_info, output_info, expected)
 {
-    bool has_error = bool(CLBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false)));
+    bool has_error = bool(CLBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false), crop_info));
     ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -114,6 +96,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchToSpaceLayerFixture<float>, framework::D
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, CLBatchToSpaceLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(), framework::dataset::make("DataType",
+                                                                                                                       DataType::F32)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, CLBatchToSpaceLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                      DataType::F32)),
                                                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -131,6 +123,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchToSpaceLayerFixture<half>, framework::Da
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, CLBatchToSpaceLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(), framework::dataset::make("DataType",
+                                                                                                                       DataType::F16)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, CLBatchToSpaceLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                     DataType::F16)),
                                                                                                             framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
diff --git a/tests/validation/CL/BoundingBoxTransform.cpp b/tests/validation/CL/BoundingBoxTransform.cpp
index 2a7f1667d6..2584b1a5b6 100644
--- a/tests/validation/CL/BoundingBoxTransform.cpp
+++ b/tests/validation/CL/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
 #include "tests/CL/CLAccessor.h"
-#include "tests/CL/CLArrayAccessor.h"
 #include "tests/Globals.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp
index 2ca8b58040..2f943e84d8 100644
--- a/tests/validation/CL/Cast.cpp
+++ b/tests/validation/CL/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,10 @@ constexpr AbsoluteTolerance<float> one_tolerance(1);
 constexpr AbsoluteTolerance<float> zero_tolerance(0);
 
 /** Input data sets **/
+// QASYMM8 / QSYMM8
+const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32));
+const auto CastQSYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QSYMM8), framework::dataset::make("DataType", DataType::F32));
+
 // U8
 const auto CastU8toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S8));
 const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
@@ -119,6 +123,26 @@ const auto CastF32toS16Dataset = combine(framework::dataset::make("DataType", Da
 const auto CastF32toU32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U32));
 const auto CastF32toS32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
 const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+
+// U64
+const auto CastU64toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U8));
+const auto CastU64toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S8));
+const auto CastU64toU16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U16));
+const auto CastU64toS16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S16));
+const auto CastU64toU32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U32));
+const auto CastU64toS32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S32));
+const auto CastU64toF16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F16));
+const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+
+// S64
+const auto CastS64toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U8));
+const auto CastS64toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S8));
+const auto CastS64toU16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U16));
+const auto CastS64toS16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S16));
+const auto CastS64toU32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U32));
+const auto CastS64toS32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S32));
+const auto CastS64toF16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F16));
+const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
 } // namespace
 
 TEST_SUITE(CL)
@@ -149,6 +173,12 @@ using CLCastToF32Fixture = CastValidationFixture<CLTensor, CLAccessor, CLCast, T
     }                                                                                                                            \
     TEST_SUITE_END()
 
+// QASYMM8
+CAST_SUITE(QASYMM8_to_F32, DataType::QASYMM8, DataType::F32, CLCastToF32Fixture<uint8_t>, CastQASYMM8toF32Dataset, zero_tolerance)
+// QSYMM8
+CAST_SUITE(QSYMM8_to_F32, DataType::QSYMM8, DataType::F32, CLCastToF32Fixture<int8_t>, CastQSYMM8toF32Dataset, zero_tolerance)
+
+
 // U8
 CAST_SUITE(U8_to_S8, DataType::U8, DataType::S8, CLCastToS8Fixture<uint8_t>, CastU8toS8Dataset, zero_tolerance)
 CAST_SUITE(U8_to_U16, DataType::U8, DataType::U16, CLCastToU16Fixture<uint8_t>, CastU8toU16Dataset, zero_tolerance)
@@ -221,6 +251,26 @@ CAST_SUITE(F32_to_U32, DataType::F32, DataType::U32, CLCastToU32Fixture<float>,
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, CLCastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, CLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
 
+// S64
+CAST_SUITE(S64_to_U8, DataType::S64, DataType::U8, CLCastToU8Fixture<int64_t>, CastS64toU8Dataset, one_tolerance)
+CAST_SUITE(S64_to_S8, DataType::S64, DataType::S8, CLCastToS8Fixture<int64_t>, CastS64toS8Dataset, one_tolerance)
+CAST_SUITE(S64_to_U16, DataType::S64, DataType::U16, CLCastToU16Fixture<int64_t>, CastS64toU16Dataset, one_tolerance)
+CAST_SUITE(S64_to_S16, DataType::S64, DataType::S16, CLCastToS16Fixture<int64_t>, CastS64toS16Dataset, one_tolerance)
+CAST_SUITE(S64_to_U32, DataType::S64, DataType::U32, CLCastToU32Fixture<int64_t>, CastS64toU32Dataset, one_tolerance)
+CAST_SUITE(S64_to_S32, DataType::S64, DataType::S32, CLCastToS32Fixture<int64_t>, CastS64toS32Dataset, one_tolerance)
+CAST_SUITE(S64_to_F16, DataType::S64, DataType::F16, CLCastToF16Fixture<int64_t>, CastS64toF16Dataset, zero_tolerance)
+CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, CLCastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
+
+// U64
+CAST_SUITE(U64_to_U8, DataType::U64, DataType::U8, CLCastToU8Fixture<uint64_t>, CastU64toU8Dataset, one_tolerance)
+CAST_SUITE(U64_to_S8, DataType::U64, DataType::S8, CLCastToS8Fixture<uint64_t>, CastU64toS8Dataset, one_tolerance)
+CAST_SUITE(U64_to_U16, DataType::U64, DataType::U16, CLCastToU16Fixture<uint64_t>, CastU64toU16Dataset, one_tolerance)
+CAST_SUITE(U64_to_S16, DataType::U64, DataType::S16, CLCastToS16Fixture<uint64_t>, CastU64toS16Dataset, one_tolerance)
+CAST_SUITE(U64_to_U32, DataType::U64, DataType::U32, CLCastToU32Fixture<uint64_t>, CastU64toU32Dataset, one_tolerance)
+CAST_SUITE(U64_to_S32, DataType::U64, DataType::S32, CLCastToS32Fixture<uint64_t>, CastU64toS32Dataset, one_tolerance)
+CAST_SUITE(U64_to_F16, DataType::U64, DataType::F16, CLCastToF16Fixture<uint64_t>, CastU64toF16Dataset, zero_tolerance)
+CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, CLCastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
+
 TEST_SUITE_END() // Cast
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/CL/Col2Im.cpp b/tests/validation/CL/Col2Im.cpp
index b651bf8918..4b004e2472 100644
--- a/tests/validation/CL/Col2Im.cpp
+++ b/tests/validation/CL/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLCol2ImKernel.h"
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
@@ -40,7 +40,7 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(Col2Im)
 
-using CLCol2Im = CLSynthetizeFunction<CLCol2ImKernel>;
+using ClCol2Im = ClSynthetizeOperatorWithBorder<opencl::kernels::ClCol2ImKernel>;
 
 /** Negative tests
  *
@@ -59,7 +59,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto input     = TensorInfo(TensorShape(10U, 12U, 1U, 2U), 1, DataType::SIZET);
         const auto output    = TensorInfo(TensorShape(3U, 4U, 10U, 1U, 2U), 1, DataType::F32);
         const auto conv_size = Size2D(3, 4);
-        const auto status    = CLCol2ImKernel::validate(&input, &output, conv_size);
+        const auto status    = opencl::kernels::ClCol2ImKernel::validate(&input, &output, conv_size);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -68,7 +68,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto input     = TensorInfo(TensorShape(10U, 12U, 1U, 2U), 1, DataType::F32);
         const auto output    = TensorInfo(TensorShape(3U, 4U, 10U, 1U, 2U), 1, DataType::F32, DataLayout::NHWC);
         const auto conv_size = Size2D(3, 4);
-        const auto status    = CLCol2ImKernel::validate(&input, &output, conv_size);
+        const auto status    = opencl::kernels::ClCol2ImKernel::validate(&input, &output, conv_size);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -77,13 +77,13 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto input     = TensorInfo(TensorShape(10U, 12U, 1U, 2U), 1, DataType::F32);
         const auto output    = TensorInfo(TensorShape(3U, 4U, 10U, 2U, 2U), 1, DataType::F32);
         const auto conv_size = Size2D(3, 4);
-        const auto status    = CLCol2ImKernel::validate(&input, &output, conv_size);
+        const auto status    = opencl::kernels::ClCol2ImKernel::validate(&input, &output, conv_size);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 }
 
 template <typename T>
-using CLCol2ImFixture = Col2ImValidationFixture<CLTensor, CLAccessor, CLCol2Im, T, true>;
+using ClCol2ImFixture = Col2ImOpValidationFixture<CLTensor, CLAccessor, ClCol2Im, T, true>;
 
 /** Test kernel for single-precision floating point
  *
@@ -99,7 +99,7 @@ using CLCol2ImFixture = Col2ImValidationFixture<CLTensor, CLAccessor, CLCol2Im,
  *  Kernel tested col2im
  */
 FIXTURE_DATA_TEST_CASE(FP32,
-                       CLCol2ImFixture<float>,
+                       ClCol2ImFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(
                                                    framework::dataset::make("InputShape", { TensorShape(8U, 16U, 3U, 1U), TensorShape(17U, 16U, 3U, 1U), TensorShape(7U, 16U, 3U, 1U) }),
@@ -125,7 +125,7 @@ FIXTURE_DATA_TEST_CASE(FP32,
  *  Kernel tested col2im
  */
 FIXTURE_DATA_TEST_CASE(F16,
-                       CLCol2ImFixture<half>,
+                       ClCol2ImFixture<half>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(
                                                    framework::dataset::make("InputShape", TensorShape(17U, 16U, 3U, 1U)),
@@ -151,7 +151,7 @@ FIXTURE_DATA_TEST_CASE(F16,
  *  Kernel tested col2im
  */
 FIXTURE_DATA_TEST_CASE(QASYMM8,
-                       CLCol2ImFixture<uint8_t>,
+                       ClCol2ImFixture<uint8_t>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(
                                                    framework::dataset::make("InputShape", TensorShape(17U, 16U, 3U, 1U)),
diff --git a/tests/validation/CL/Comparisons.cpp b/tests/validation/CL/Comparisons.cpp
index d015528b0e..dd3dbd8d59 100644
--- a/tests/validation/CL/Comparisons.cpp
+++ b/tests/validation/CL/Comparisons.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,7 @@ TEST_SUITE(Comparison)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
         framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid output type
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching input types
-                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
+                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
         }),
@@ -75,7 +75,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
         })),
-        framework::dataset::make("Expected", { false, false, false, false, true})),
+        framework::dataset::make("Expected", { false, false, true, false, true})),
         input1_info, input2_info, output_info, expected)
 {
     Status s = CLComparison::validate(&input1_info.clone()->set_is_resizable(false),
diff --git a/tests/validation/CL/Convolution3D.cpp b/tests/validation/CL/Convolution3D.cpp
new file mode 100644
index 0000000000..a2848560c3
--- /dev/null
+++ b/tests/validation/CL/Convolution3D.cpp
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLConv3D.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/DirectConvolution3DFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const RelativeTolerance<half>        rel_tolerance_fp16(half(0.2)); /**< Relative tolerance passed to validate() by the FP16 tests */
+constexpr float                      abs_tolerance_fp16(0.05f);     /**< Absolute tolerance passed to validate() by the FP16 tests */
+constexpr RelativeTolerance<float>   rel_tolerance_fp32(0.05f);     /**< Relative tolerance passed to validate() by the FP32 tests */
+constexpr float                      abs_tolerance_fp32(0.0001f);   /**< Absolute tolerance passed to validate() by the FP32 tests */
+constexpr AbsoluteTolerance<uint8_t> abs_tolerance_qasymm8(1);      /**< Absolute tolerance shared by the QASYMM8 and QASYMM8_SIGNED tests */
+constexpr float                      tolerance_num = 0.07f;         /**< Tolerance number passed to validate() by the FP16 tests only (the FP32 tests pass 0.0) */
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DirectConvolution3D)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 5U, 3U), // Unsupported data layout
+                                                        TensorShape(27U, 13U, 5U, 3U), // Unsupported activation enabled
+                                                        TensorShape(27U, 13U, 5U, 3U), // Mismatching data type
+                                                        TensorShape(27U, 13U, 5U, 3U), // Unsupported data type
+                                                        TensorShape(27U, 13U, 5U, 3U), // Mismatching input feature maps
+                                                        TensorShape(27U, 13U, 5U, 3U), // Mismatching output feature maps
+                                                        TensorShape(27U, 13U, 5U, 3U), // Mismatching bias shape
+                                                        TensorShape(27U, 13U, 5U, 3U), // Unsupported number of weights dimensions
+                                                        TensorShape(27U, 13U, 5U, 3U), // Unsupported number of dimensions (5D OutputShape; this row's BiasesShape is 1D)
+                                                        TensorShape(27U, 13U, 5U, 3U), // Mismatching output shape
+                                                        TensorShape(27U, 13U, 5U, 3U) // Valid configuration (the only row expected to pass)
+                                                     }),
+               framework::dataset::make("WeightsShape", { TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 32U, 3U, 3U, 3U),
+                                                          TensorShape(8U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U, 2U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U),
+                                                          TensorShape(4U, 27U, 3U, 3U, 3U)
+                                                     })),
+               framework::dataset::make("BiasesShape", { TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(8U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U),
+                                                         TensorShape(4U)
+                                                     })),
+               framework::dataset::make("OutputShape", { TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U, 2U),
+                                                         TensorShape(4U, 11U, 5U, 3U),
+                                                         TensorShape(4U, 13U, 5U, 3U)
+                                                     })),
+               framework::dataset::make("Conv3dInfo",  { Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false),
+                                                         Conv3dInfo(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false)
+                                                      })),
+                framework::dataset::make("SrcDataType", { DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::U32,
+                                                          DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::F32,
+                                                          DataType::F32
+                                                      })),
+                framework::dataset::make("WeightsDataType", { DataType::F32,
+                                                              DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::U32,
+                                                              DataType::F32,
+                                                              DataType::F32,
+                                                              DataType::F32,
+                                                              DataType::F32,
+                                                              DataType::F32,
+                                                              DataType::F32,
+                                                              DataType::F32
+                                                      })),
+                framework::dataset::make("DataLayout", { DataLayout::NCDHW,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC,
+                                                         DataLayout::NDHWC
+                                                      })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })), // One flag per row above, in the same order
+               input_shape, weights_shape, biases_shape, output_shape, conv3d_info, src_data_type, weights_data_type, data_layout, expected)
+{
+    TensorInfo input_info   = TensorInfo(input_shape, 1, src_data_type);
+    TensorInfo weights_info = TensorInfo(weights_shape, 1, weights_data_type); // Only the weights use their own data type column
+    TensorInfo biases_info  = TensorInfo(biases_shape, 1, src_data_type); // Biases reuse the source data type
+    TensorInfo output_info  = TensorInfo(output_shape, 1, src_data_type); // Output reuses the source data type
+
+    input_info.set_data_layout(data_layout); // All four infos receive the layout of the current row
+    weights_info.set_data_layout(data_layout);
+    biases_info.set_data_layout(data_layout);
+    output_info.set_data_layout(data_layout);
+
+    bool is_valid = bool(CLConv3D::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv3d_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); // validate() is run on non-resizable clones and must match the row's Expected flag
+}
+
+template <typename T>
+using CLDirectConvolution3DFixture = DirectConvolution3DValidationFixture<CLTensor, CLAccessor, CLConv3D, T>; // Fixture alias used by the FP16 and FP32 suites below
+template <typename T>
+using CLDirectConvolution3DQuantizedFixture = DirectConvolution3DValidationQuantizedFixture<CLTensor, CLAccessor, CLConv3D, T>; // Fixture alias used by the QASYMM8 and QASYMM8_SIGNED suites below
+
+TEST_SUITE(NDHWC)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolution3DFixture<half>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip( // Twelve four-entry datasets are zipped, then combined with one Activation, DataType and DataLayout value
+                       framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U), // The only 5D input; the other three are 4D
+                                                                TensorShape(15U, 7U, 11U, 7U),
+                                                                TensorShape(19U, 5U, 16U, 4U),
+                                                                TensorShape(13U, 5U, 17U, 2U)
+                                                              }),
+                       framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                       framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                       framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                       framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                       framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                       framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                       framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                       framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                       framework::dataset::make("HasBias", { true, true, true, false })), // Only the last configuration runs without bias
+                       framework::dataset::make("Activation", ActivationLayerInfo())), // Default-constructed activation info
+                       framework::dataset::make("DataType", DataType::F16)),
+                       framework::dataset::make("DataLayout", DataLayout::NDHWC)))
+{
+    validate(CLAccessor(_target), _reference, rel_tolerance_fp16, tolerance_num, abs_tolerance_fp16); // _target/_reference come from the fixture (declared outside this file) - presumably the CL result and the reference result
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolution3DFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip( // Same dataset values as the FP16 suite, with DataType::F32
+                       framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U), // The only 5D input; the other three are 4D
+                                                                TensorShape(15U, 7U, 11U, 7U),
+                                                                TensorShape(19U, 5U, 16U, 4U),
+                                                                TensorShape(13U, 5U, 17U, 2U)
+                                                              }),
+                       framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                       framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                       framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                       framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                       framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                       framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                       framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                       framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                       framework::dataset::make("HasBias", { true, true, true, false })), // Only the last configuration runs without bias
+                       framework::dataset::make("Activation", ActivationLayerInfo())), // Default-constructed activation info
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataLayout", DataLayout::NDHWC)))
+{
+    validate(CLAccessor(_target), _reference, rel_tolerance_fp32, 0.0, abs_tolerance_fp32); // A literal 0.0 is passed where the FP16 suite passes tolerance_num
+}
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolution3DQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))), // Single fixed value; arguments are presumably (scale, offset) - confirm against QuantizationInfo
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(CLAccessor(_target), _reference, abs_tolerance_qasymm8); // Quantized results are checked with the absolute tolerance only
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolution3DQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))), // Same three quantization values as the QASYMM8 suite
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(CLAccessor(_target), _reference, abs_tolerance_qasymm8); // NOTE(review): the tolerance is declared as AbsoluteTolerance<uint8_t> but is reused for this int8_t fixture - confirm this is intended
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // NDHWC
+TEST_SUITE_END() // DirectConvolution3D
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index b66cfd97e7..8820a6a31e 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
@@ -37,12 +38,16 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ConvolutionLayerFixture.h"
 
+/** Synced with tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+ *  Please check there for any differences in the coverage
+ */
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 class SmallConvolutionLayerDatasetCases final : public datasets::ConvolutionLayerDataset
@@ -61,32 +66,32 @@ constexpr AbsoluteTolerance<float>  tolerance_qasymm8(1);                 /**< T
 constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
     DataType::F16,
-    DataType::F32,
-    DataType::QASYMM8,
-    DataType::QASYMM8_SIGNED,
+             DataType::F32,
+             DataType::QASYMM8,
+             DataType::QASYMM8_SIGNED,
 });
 
 /** Grouped CNN data types */
-const auto GroupedCNNDataTypes = framework::dataset::make("DataType",
+const auto GroupedCNNDataTypes = make("DataType",
 {
     DataType::F16,
-    DataType::F32
+             DataType::F32
 });
 
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsDataset      = make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
 });
-const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsSmallDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
 });
 } // namespace
 
@@ -96,7 +101,7 @@ TEST_SUITE(ConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-                                          framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),            // Select GEMM
+                                          make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),            // Select GEMM
                                                                                   TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),            // Select GEMM
                                                                                   TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32),        // Select GEMM
                                                                                   TensorInfo(TensorShape(23U, 27U, 31U, 4U), 1, DataType::F32),       // Select WINOGRAD
@@ -106,7 +111,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                                                                   TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),            // Select GEMM
                                                                                   TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::QASYMM8_SIGNED), // Select GEMM
                                           }),
-                                          framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32),
+                                          make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(3U, 3U, 31U, 21U), 1, DataType::F32),
@@ -116,7 +121,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                                                                     TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::QASYMM8_SIGNED),
                                           })),
-                                          framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+                                          make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
@@ -126,7 +131,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                                                                    TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::QASYMM8_SIGNED),
                                           })),
-                                          framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
+                                          make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
                                                                                  PadStrideInfo(1, 2, 1, 1),
                                                                                  PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(1, 1, 0, 0),
@@ -136,7 +141,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                                                                  PadStrideInfo(1, 1, 2, 2),
                                                                                  PadStrideInfo(1, 1, 2, 2),
                                           })),
-                                          framework::dataset::make("GpuTarget", { GPUTarget::BIFROST,
+                                          make("GpuTarget", { GPUTarget::BIFROST,
                                                                                   GPUTarget::MIDGARD,
                                                                                   GPUTarget::G71,
                                                                                   GPUTarget::G71,
@@ -146,7 +151,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                                                                   GPUTarget::BIFROST,
                                                                                   GPUTarget::BIFROST,
                                           })),
-                                          framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                                          make("Dilation", { Size2D(1U, 1U),
                                                                  Size2D(1U, 1U),
                                                                  Size2D(1U, 1U),
                                                                  Size2D(1U, 1U),
@@ -156,8 +161,8 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                                                  Size2D(2U, 1U),
                                                                  Size2D(2U, 1U),
                                           })),
-                                         framework::dataset::make("EnableFastMath", { false, false, false, false, false, false, true, true, true })),
-                                         framework::dataset::make("Expected",{ ConvolutionMethod::GEMM,
+                                         make("EnableFastMath", { false, false, false, false, false, false, true, true, true })),
+                                         make("Expected",{ ConvolutionMethod::GEMM,
                                                                                ConvolutionMethod::GEMM,
                                                                                ConvolutionMethod::GEMM,
                                                                                ConvolutionMethod::WINOGRAD,
@@ -186,15 +191,16 @@ TEST_SUITE_END() // ConvolutionLayer
 TEST_SUITE(GEMMConvolutionLayer)
 template <typename T>
 using CLGEMMConvolutionLayerFixture = ConvolutionValidationFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
+template <typename T>
+using CLGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T, true>;
+template <typename T>
+using CLConvolutionValidationWithPaddingFixture = ConvolutionValidationWithPaddingFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                           DataType::F16)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   make("ReshapeWeights", { true })), make("DataType", DataType::F16)), make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                            ActivationFunctionsSmallDataset))
 {
     // Validate output
@@ -205,93 +211,231 @@ TEST_SUITE_END() // FP16
 TEST_SUITE(FP32)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                    framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F32)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    make("ReshapeWeights", { true })), make("DataType", DataType::F32)), make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                             ActivationFunctionsSmallDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                           make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                           make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                       make("Bias", TensorShape(2U))),
+                                                                               make("Output", TensorShape(11U, 25U, 2U))),
+                                                                       make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                               make("Dilation", Size2D(1, 1))),
+                                                       make("ReshapeWeights", { true })),
+                                               make("DataType", DataType::F32)),
+                                       make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               ActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, CLConvolutionValidationWithPaddingFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerPrePaddingDataset(),
+                                                               make("ReshapeWeights", { true })),
+                                                       make("DataType", DataType::F32)),
+                                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       make("ActivationInfo", { ActivationLayerInfo() })),
+make("PrePadLayer", { PaddingList({ { 1, 1 }, { 1, 1 } }) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 template <typename T>
 using CLGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
 template <typename T>
+using CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T, true>;
+template <typename T>
 using CLGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T, int8_t>;
 
-const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
-{
-    ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
-});
-const auto QuantizedActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
-{
-    ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
-});
-
 TEST_SUITE(Quantized)
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(0.5f, 10),
     QuantizationInfo(0.3f, 3),
     QuantizationInfo(1.1f, 10),
 });
+
+/// @note: Every asymmetric quantized test has one version with activation and one without, because the quantization
+/// info given is ignored when there is no activation. Instead of using the same quantization information for all the
+/// tensors, the fixture generates separate quantization info for each input tensor and for the output tensor.
+/// Once dynamic quantization is also supported in the presence of an activation, these two versions should be merged
+/// again, with the explicitly specified quantization info removed.
+const auto NoActivation = make("ActivationInfo", ActivationLayerInfo());
+
+const auto IgnoredQuantizationInfo = make("IgnoredQuantizationInfo", QuantizationInfo());
+
+const auto QuantizedActivationFunctionsSmallDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+
 TEST_SUITE(QASYMM8)
 
 FIXTURE_DATA_TEST_CASE(RunSmallCases, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(SmallConvolutionLayerDatasetCases(),
-                                                               framework::dataset::make("ReshapeWeights", { true })),
-                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       QuantizationData),
-                               QuantizedActivationFunctionsSmallDataset))
+    combine(SmallConvolutionLayerDatasetCases(),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        IgnoredQuantizationInfo,
+        NoActivation))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallCasesWithActivation, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(SmallConvolutionLayerDatasetCases(),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        QuantizationData,
+        QuantizedActivationFunctionsSmallDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                               framework::dataset::make("ReshapeWeights", { true })),
-                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       QuantizationData),
-                               QuantizedActivationFunctionsSmallDataset))
+    combine(datasets::SmallConvolutionLayerDataset(),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        IgnoredQuantizationInfo,
+        NoActivation))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(datasets::SmallConvolutionLayerDataset(),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        QuantizationData,
+        QuantizedActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Input", TensorShape(23U, 27U, 5U)),
+        make("Weights", TensorShape(3U, 3U, 5U, 2U)),
+        make("Bias", TensorShape(2U)),
+        make("Output", TensorShape(11U, 25U, 2U)),
+        make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)),
+        make("Dilation", Size2D(1, 1)),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        IgnoredQuantizationInfo,
+        NoActivation))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Input", TensorShape(23U, 27U, 5U)),
+        make("Weights", TensorShape(3U, 3U, 5U, 2U)),
+        make("Bias", TensorShape(2U)),
+        make("Output", TensorShape(11U, 25U, 2U)),
+        make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)),
+        make("Dilation", Size2D(1, 1)),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        QuantizationData,
+        QuantizedActivationFunctionsSmallDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                               framework::dataset::make("ReshapeWeights", { true })),
-                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       QuantizationData),
-                               QuantizedActivationFunctionsSmallDataset))
+    combine(datasets::SmallConvolutionLayerDataset(),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        IgnoredQuantizationInfo,
+        NoActivation))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLGEMMConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(datasets::SmallConvolutionLayerDataset(),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        QuantizationData,
+        QuantizedActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Input", TensorShape(23U, 27U, 5U)),
+        make("Weights", TensorShape(3U, 3U, 5U, 2U)),
+        make("Bias", TensorShape(2U)),
+        make("Output", TensorShape(11U, 25U, 2U)),
+        make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)),
+        make("Dilation", Size2D(1, 1)),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        IgnoredQuantizationInfo,
+        NoActivation))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Input", TensorShape(23U, 27U, 5U)),
+        make("Weights", TensorShape(3U, 3U, 5U, 2U)),
+        make("Bias", TensorShape(2U)),
+        make("Output", TensorShape(11U, 25U, 2U)),
+        make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0)),
+        make("Dilation", Size2D(1, 1)),
+        make("ReshapeWeights", { true }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        QuantizationData,
+        QuantizedActivationFunctionsSmallDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE(QSYMM8_PER_CHANNEL)
+const auto QuantizedActivationFunctionsSmallPerChannelDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+
 
 FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                       framework::dataset::make("ReshapeWeights", { true })),
-                                                               framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
-                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                       make("ReshapeWeights", { true })),
+                                                               make("DataType", { DataType::QASYMM8_SIGNED })),
+                                                       make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                QuantizationData),
-                                       QuantizedActivationFunctionsSmallDataset),
-                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+                                       QuantizedActivationFunctionsSmallPerChannelDataset),
+                               make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -299,12 +443,12 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLGEMMConvolutionLayerQuantizedPerChannel
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                       framework::dataset::make("ReshapeWeights", { true })),
-                                                               framework::dataset::make("DataType", { DataType::QASYMM8 })),
-                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                       make("ReshapeWeights", { true })),
+                                                               make("DataType", { DataType::QASYMM8 })),
+                                                       make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                QuantizationData),
-                                       QuantizedActivationFunctionsSmallDataset),
-                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+                                       QuantizedActivationFunctionsSmallPerChannelDataset),
+                               make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -323,9 +467,7 @@ TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                   make("ReshapeWeights", { true })), make("DataType", DataType::F32)), make("DataLayout", { DataLayout::NCHW })),
                                                                                                                    ActivationFunctionsSmallDataset))
 {
     // Validate output
@@ -334,9 +476,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<float>, fr
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMGroupedConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(combine(datasets::LargeGroupedConvolutionLayerDataset(),
-                                                       framework::dataset::make("ReshapeWeights", { true })),
-                                               framework::dataset::make("DataType", DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                       make("ReshapeWeights", { true })),
+                                               make("DataType", DataType::F32)),
+                                       make("DataLayout", { DataLayout::NCHW })),
                                ActivationFunctionsDataset))
 {
     // Validate output
@@ -347,9 +489,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
-                                                                                                                  framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                  make("ReshapeWeights", { true })), make("DataType", DataType::F16)), make("DataLayout", { DataLayout::NCHW })),
                                                                                                                   ActivationFunctionsSmallDataset))
 {
     // Validate output
@@ -358,9 +498,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<half>, fra
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMGroupedConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(combine(datasets::LargeGroupedConvolutionLayerDataset(),
-                                                       framework::dataset::make("ReshapeWeights", { true })),
-                                               framework::dataset::make("DataType", DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                       make("ReshapeWeights", { true })),
+                                               make("DataType", DataType::F16)),
+                                       make("DataLayout", { DataLayout::NCHW })),
                                ActivationFunctionsDataset))
 {
     // Validate output
diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp
index c284cdcee3..d1508fd902 100644
--- a/tests/validation/CL/DeconvolutionLayer.cpp
+++ b/tests/validation/CL/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,27 +53,29 @@ const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorSh
                                  *framework::dataset::make("PadLeft", 3)
                                  *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
-const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape{ 640U, 360U, 56U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
-
-const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3)
-                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", { 3 });
-
-const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 2)
+const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4, 2) * framework::dataset::make("StrideY", 2, 4) * framework::dataset::make("PadX", 1, 3)
                      * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
 
+const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 2, 4) * framework::dataset::make("StrideY", 1, 4, 2) * framework::dataset::make("PadX", 1, 2)
+                     * framework::dataset::make("PadY", 1, 3) * framework::dataset::make("NumKernels", { 3 });
+
 const auto data3x3_asymm = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadLeft", 0, 1)
                            * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", { 3 });
 
 const auto data3x3_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadX", 0, 2)
                                * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
 
+const auto data3x3_precommit_large_channels = datasets::SmallDeconvolutionShapesWithLargerChannels() * framework::dataset::make("StrideX", 2) * framework::dataset::make("StrideY", 2)
+                                              * framework::dataset::make("PadX", 1)
+                                              * framework::dataset::make("PadY", 2) * framework::dataset::make("NumKernels", { 5 });
+
 const auto data2x2_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 2) * framework::dataset::make("StrideY", 2) * framework::dataset::make("PadX", 1)
                                * framework::dataset::make("PadY", 1) * framework::dataset::make("NumKernels", { 3 });
 
-const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
+const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4, 2) * framework::dataset::make("StrideY", 2, 4) * framework::dataset::make("PadX", 0, 1)
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+
+const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 2, 4) * framework::dataset::make("StrideY", 1, 4, 2) * framework::dataset::make("PadX", 0, 1)
                      * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 });
 
 const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
@@ -87,42 +89,73 @@ TEST_SUITE(DeconvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),   // Mismatching data type
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),   // Invalid weights shape
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16),   // Non supported data type
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),  // Invalid bias shape
-                                            TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32), // Window shrink
-                                            TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32),
-                                          }),
+    framework::dataset::make("InputInfo",   { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),   // Mismatching data type
+                                              TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),   // Invalid weights shape
+                                              TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16),   // Non supported data type
+                                              TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),   // Invalid bias shape
+                                              TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32), // Window shrink
+                                              TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32),   // Valid configuration (the only NCHW row with Expected = true)
+                                              TensorInfo(TensorShape(2U, 13U, 27U), 1, DataType::F32, DataLayout::NHWC),   // Mismatching data type (NHWC)
+                                              TensorInfo(TensorShape(2U, 13U, 27U), 1, DataType::F32, DataLayout::NHWC),   // Invalid weights shape (NHWC)
+                                              TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16, DataLayout::NHWC),   // Non supported data type (NHWC); NOTE(review): shape is identical to the NCHW row, not permuted like the other NHWC rows - confirm intended
+                                              TensorInfo(TensorShape(2U, 13U, 27U), 1, DataType::F32, DataLayout::NHWC),   // Invalid bias shape (NHWC)
+                                              TensorInfo(TensorShape(4U, 11U, 13U, 3U), 1, DataType::F32, DataLayout::NHWC), // Window shrink (NHWC)
+                                              TensorInfo(TensorShape(2U, 16U, 32U), 1, DataType::F32, DataLayout::NHWC),   // Valid configuration (the only NHWC row with Expected = true)
+                                            }),
     framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
+                                              TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32),
                                               TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32),
-                                          })),
-    framework::dataset::make("BiasInfo",  { TensorInfo(TensorShape(1U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(1U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(1U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(25U, 11U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(1U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32),
-                                          })),
+                                              TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F16, DataLayout::NHWC), // NHWC, mismatching data type case: F16 weights paired with an F32 input
+                                              TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC), // NHWC, invalid weights shape case
+                                              TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16, DataLayout::NHWC), // NHWC, non supported data type case; NOTE(review): shape is identical to the NCHW row, not permuted - confirm intended
+                                              TensorInfo(TensorShape(2U, 2U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC), // NHWC, invalid bias shape case
+                                              TensorInfo(TensorShape(4U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), // NHWC, window shrink case
+                                              TensorInfo(TensorShape(2U, 2U, 2U, 4U), 1, DataType::F32, DataLayout::NHWC), // NHWC, valid case (Expected = true)
+                                            })),
+    framework::dataset::make("BiasInfo",    { TensorInfo(TensorShape(1U), 1, DataType::F16),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(25U, 11U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F16, DataLayout::NHWC),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F32, DataLayout::NHWC),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F32, DataLayout::NHWC),
+                                              TensorInfo(TensorShape(25U, 11U), 1, DataType::F32, DataLayout::NHWC),
+                                              TensorInfo(TensorShape(1U), 1, DataType::F32, DataLayout::NHWC),
+                                              TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                            })),
+    framework::dataset::make("OutputInfo",  { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
+                                              TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32),
+                                              TensorInfo(TensorShape(2U, 11U, 25U), 1, DataType::F16, DataLayout::NHWC), // NHWC, mismatching data type case: F16 output paired with an F32 input
+                                              TensorInfo(TensorShape(2U, 10U, 25U), 1, DataType::F32, DataLayout::NHWC), // NHWC, invalid weights shape case
+                                              TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, DataLayout::NHWC), // NHWC, non supported data type case; NOTE(review): shape is identical to the NCHW row, not permuted - confirm intended
+                                              TensorInfo(TensorShape(2U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC), // NHWC, invalid bias shape case
+                                              TensorInfo(TensorShape(1U, 9U, 11U, 3U), 1, DataType::F32, DataLayout::NHWC), // NHWC, window shrink case
+                                              TensorInfo(TensorShape(4U, 43U, 91U), 1, DataType::F32, DataLayout::NHWC), // NHWC, valid case; dims are consistent with (in - 1) * stride + kernel - 2 * pad: (16 - 1) * 3 + 2 - 2 * 2 = 43 and (32 - 1) * 3 + 2 - 2 * 2 = 91
+                                            })),
     framework::dataset::make("PadStrideInfo", { PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 1, 1),
                                                 PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0), // NHWC, mismatching data type case
+                                                PadStrideInfo(1, 1, 0, 0), // NHWC, invalid weights shape case
+                                                PadStrideInfo(1, 1, 0, 0), // NHWC, non supported data type case
+                                                PadStrideInfo(1, 1, 0, 0), // NHWC, invalid bias shape case
+                                                PadStrideInfo(1, 1, 1, 1), // NHWC, window shrink case
+                                                PadStrideInfo(3, 3, 2, 2), // NHWC, valid case; NOTE(review): arguments presumably are stride_x, stride_y, pad_x, pad_y - confirm against PadStrideInfo
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, false, true,            // NCHW
+                                           false, false, false, false, false, true })),        // NHWC
     input_info, weights_info, bias_info, output_info, pad_info, expected)
 {
     bool is_valid = bool(CLDeconvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info));
@@ -149,6 +182,9 @@ using CLDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<CLTensor,
 template <typename T>
 using CLDeconvolutionLayerAsymmFixture9x9 = DeconvolutionValidationAsymmFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 9, 9>;
 
+template <typename T>
+using CLDeconvolutionLayerFixture5x1 = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 5, 1>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
@@ -171,6 +207,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerFixture3x3<float>, framewor
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithLargeChannels, CLDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data3x3_precommit_large_channels,
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       data_layouts_dataset),
+                       framework::dataset::make("AddBias", { true })))
+{
+    // Check the CL target tensor (_target) against the reference result (_reference) using the FP32 tolerance
+    validate(CLAccessor(_target), _reference, tolerance_fp32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunAsymm, CLDeconvolutionLayerAsymmFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3_asymm, framework::dataset::make("DataType",
                                                                                                                       DataType::F32)),
                                                                                                                       data_layouts_dataset),
@@ -180,8 +227,8 @@ FIXTURE_DATA_TEST_CASE(RunAsymm, CLDeconvolutionLayerAsymmFixture3x3<float>, fra
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                 data_layouts_dataset),
-                                                                                                                 add_bias_dataset))
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                 framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -202,7 +249,7 @@ TEST_SUITE_END() // W2x2
 TEST_SUITE(W1x1)
 FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data1x1, framework::dataset::make("DataType", DataType::F32)),
                                                                                                                     data_layouts_dataset),
-                                                                                                            add_bias_dataset))
+                                                                                                            framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -218,6 +265,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerAsymmFixture9x9<float>, fra
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
 TEST_SUITE_END() // W9x9
+
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture5x1<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                    data_layouts_dataset),
+                                                                                                            framework::dataset::make("AddBias", { true })))
+{
+    // Check the CL target tensor (_target) against the reference result (_reference) using the FP32 tolerance
+    validate(CLAccessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
@@ -225,7 +283,7 @@ TEST_SUITE(FP16)
 TEST_SUITE(W4x4)
 FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture4x4<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data4x4, framework::dataset::make("DataType", DataType::F16)),
                                                                                                                    data_layouts_dataset),
-                                                                                                           add_bias_dataset))
+                                                                                                           framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -242,8 +300,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerFixture3x3<half>, framework
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                        data_layouts_dataset),
-                                                                                                                add_bias_dataset))
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -264,13 +322,23 @@ TEST_SUITE_END() // W2x2
 TEST_SUITE(W1x1)
 FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture1x1<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data1x1, framework::dataset::make("DataType", DataType::F16)),
                                                                                                                    data_layouts_dataset),
-                                                                                                           add_bias_dataset))
+                                                                                                           framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerFixture5x1<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                   data_layouts_dataset),
+                                                                                                           framework::dataset::make("AddBias", { true })))
+{
+    // Check the CL target tensor (_target) against the reference result (_reference), passing tolerance_f16 and tolerance_num
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
@@ -286,6 +354,24 @@ using CLDeconvolutionLayerQuantizedFixture2x2 = DeconvolutionValidationQuantized
 template <typename T>
 using CLDeconvolutionLayerQuantizedFixture1x1 = DeconvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using CLDeconvolutionLayerQuantizedFixture5x1 = DeconvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 5, 1>;
+
+template <typename T>
+using CLDeconvolutionLayerQuantizedPerChannelFixture4x4 = DeconvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, int8_t, 4, 4>;
+
+template <typename T>
+using CLDeconvolutionLayerQuantizedPerChannelFixture3x3 = DeconvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, int8_t, 3, 3>;
+
+template <typename T>
+using CLDeconvolutionLayerQuantizedPerChannelFixture2x2 = DeconvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, int8_t, 2, 2>;
+
+template <typename T>
+using CLDeconvolutionLayerQuantizedPerChannelFixture1x1 = DeconvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, int8_t, 1, 1>;
+
+template <typename T>
+using CLDeconvolutionLayerQuantizedPerChannelFixture5x1 = DeconvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, int8_t, 5, 1>;
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 
@@ -295,7 +381,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4<uint8_t>, fr
                                                                                                                        data_layouts_dataset),
                                                                                                                        framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })),
                                                                                                                        framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })),
-                                                                                                                       add_bias_dataset))
+                                                                                                                       framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
@@ -315,12 +401,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
-                       framework::dataset::make("DataType",
-                                                DataType::QASYMM8)),
-                       data_layouts_dataset),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                        framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 128) })),
                        framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 128), QuantizationInfo(4.f / 255.f, 128) })),
-                       add_bias_dataset))
+                       framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
@@ -346,13 +431,26 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1<uint8_t>, fr
                                                                                                                        data_layouts_dataset),
                                                                                                                        framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
                                                                                                                        framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
-                                                                                                                       add_bias_dataset))
+                                                                                                                       framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture5x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                       DataType::QASYMM8)),
+                                                                                                                       data_layouts_dataset),
+                                                                                                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })),
+                                                                                                                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })),
+                                                                                                                       framework::dataset::make("AddBias", { true })))
+{
+    // Check the CL target tensor (_target) against the reference result (_reference), passing tolerance_qasymm8 and tolerance_num
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -366,7 +464,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture4x4<int8_t>, fra
                                                                                                                       data_layouts_dataset),
                                                                                                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })),
                                                                                                                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })),
-                                                                                                                      add_bias_dataset))
+                                                                                                                      framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
@@ -388,12 +486,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture3x3<int8_t>
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerQuantizedFixture3x3<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
-                       framework::dataset::make("DataType",
-                                                DataType::QASYMM8_SIGNED)),
-                       data_layouts_dataset),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                        framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, -10), QuantizationInfo(2.f / 255.f, 127) })),
                        framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 64), QuantizationInfo(4.f / 255.f, -128) })),
-                       add_bias_dataset))
+                       framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
@@ -414,20 +511,192 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture2x2<int8_t>
 TEST_SUITE_END() // W2x2
 
 TEST_SUITE(W1x1) // DirectDeconvolution and GEMMDeconvolution
-FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType",
-                                                                                                                      DataType::QASYMM8_SIGNED)),
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1,
+                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                       data_layouts_dataset),
                                                                                                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
                                                                                                                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
-                                                                                                                      add_bias_dataset))
+                                                                                                                      framework::dataset::make("AddBias", { true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture5x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                      DataType::QASYMM8_SIGNED)),
+                                                                                                                      data_layouts_dataset),
+                                                                                                                      framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10), QuantizationInfo(2.f / 255.f, 5) })),
+                                                                                                                      framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 5), QuantizationInfo(4.f / 255.f, 10) })),
+                                                                                                                      framework::dataset::make("AddBias", { true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8_SIGNED
 
+const auto input_qinfo_dataset         = framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 10) });
+const auto output_qinfo_dataset        = framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0) });
+const auto input_signed_qinfo_dataset  = framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, -10) });
+const auto output_signed_qinfo_dataset = framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 10) });
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+TEST_SUITE(W4x4)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture4x4<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_dataset),
+                       output_qinfo_dataset),
+                       framework::dataset::make("AddBias", { true })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFixture4x4<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_dataset),
+                       output_signed_qinfo_dataset),
+                       framework::dataset::make("AddBias", { true })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W4x4
+
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_dataset),
+                       output_qinfo_dataset),
+                       framework::dataset::make("AddBias", { true })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFixture3x3<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_dataset),
+                       output_signed_qinfo_dataset),
+                       framework::dataset::make("AddBias", { true })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallSignedPrecommit, CLDeconvolutionLayerQuantizedPerChannelFixture2x2<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(data3x3_precommit,
+                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                               data_layouts_dataset),
+                                                       input_signed_qinfo_dataset),
+                                               output_signed_qinfo_dataset),
+                                       add_bias_dataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W3x3
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture2x2<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(data3x3_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_dataset),
+                       output_qinfo_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+
+TEST_SUITE(W2x2)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture2x2<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(data2x2_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_dataset),
+                       output_qinfo_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFixture2x2<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(data2x2_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_dataset),
+                       output_signed_qinfo_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W2x2
+
+TEST_SUITE(W1x1)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture1x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_dataset),
+                       output_qinfo_dataset),
+                       framework::dataset::make("AddBias", { false })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_dataset),
+                       output_signed_qinfo_dataset),
+                       framework::dataset::make("AddBias", { true })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W1x1
+
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedPerChannelFixture5x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_dataset),
+                       output_qinfo_dataset),
+                       framework::dataset::make("AddBias", { true })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLDeconvolutionLayerQuantizedPerChannelFixture5x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_dataset),
+                       output_signed_qinfo_dataset),
+                       framework::dataset::make("AddBias", { false })),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8, tolerance_num);
+}
+TEST_SUITE_END() // W5x1
+
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DeconvolutionLayer
diff --git a/tests/validation/CL/DepthConvertLayer.cpp b/tests/validation/CL/DepthConvertLayer.cpp
index a823b278fc..490b38ccf6 100644
--- a/tests/validation/CL/DepthConvertLayer.cpp
+++ b/tests/validation/CL/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,17 +44,16 @@ namespace validation
 namespace
 {
 /** Input data sets **/
-const auto DepthConvertLayerU8toU16Dataset        = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
-const auto DepthConvertLayerU8toS16Dataset        = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-const auto DepthConvertLayerU8toS32Dataset        = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32));
-const auto DepthConvertLayerU16toU8Dataset        = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8));
-const auto DepthConvertLayerU16toU32Dataset       = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
-const auto DepthConvertLayerS16toU8Dataset        = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
-const auto DepthConvertLayerS16toS32Dataset       = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
-const auto DepthConvertLayerF16toF32Dataset       = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
-const auto DepthConvertLayerF32toF16Dataset       = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
-const auto DepthConvertLayerShiftDatasetNightly   = framework::dataset::make("Shift", 0, 7);
-const auto DepthConvertLayerShiftDatasetPrecommit = framework::dataset::make("Shift", { 0, 3, 6 });
+const auto DepthConvertLayerU8toU16Dataset   = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
+const auto DepthConvertLayerU8toS16Dataset   = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
+const auto DepthConvertLayerU8toS32Dataset   = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32));
+const auto DepthConvertLayerU16toU8Dataset   = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8));
+const auto DepthConvertLayerU16toU32Dataset  = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
+const auto DepthConvertLayerS16toU8Dataset   = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
+const auto DepthConvertLayerS16toS32Dataset  = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
+const auto DepthConvertLayerF16toF32Dataset  = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
+const auto DepthConvertLayerF32toF16Dataset  = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+const auto DepthConvertLayerZeroShiftDataset = framework::dataset::make("Shift", 0);
 } // namespace
 
 TEST_SUITE(CL)
@@ -63,7 +62,7 @@ TEST_SUITE(DepthConvertLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Invalid data type combination
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Support upcasting from QASYMM8 to S16
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Mismatching shapes
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),      // Invalid shift
@@ -84,8 +83,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                    ConvertPolicy::WRAP,
                                                    ConvertPolicy::WRAP,
                                                      })),
-               framework::dataset::make("Shift",{ 1, 1, 8, 1, 1, 1, })),
-               framework::dataset::make("Expected", { false, false, false, false, false, true})),
+               framework::dataset::make("Shift",{ 0, 0, 0, 1, 1, 0, })),
+               framework::dataset::make("Expected", { true, false, false, false, false, true})),
                input_info, output_info, policy, shift, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLDepthConvertLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), policy, shift)) == expected, framework::LogLevel::ERRORS);
@@ -111,7 +110,7 @@ using CLDepthConvertLayerToF32Fixture = DepthConvertLayerValidationFixture<CLTen
 TEST_SUITE(U8_to_U16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -119,7 +118,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -129,7 +128,7 @@ TEST_SUITE_END() // U8_to_U16
 TEST_SUITE(U8_to_S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -137,7 +136,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -146,7 +145,7 @@ TEST_SUITE_END() // U8_to_S16
 TEST_SUITE(U8_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -154,7 +153,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -164,14 +163,14 @@ TEST_SUITE_END() // U8_to_S32
 TEST_SUITE(U16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -181,14 +180,14 @@ TEST_SUITE_END() // U16_to_U8
 TEST_SUITE(U16_to_U32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                       DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetNightly))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -198,14 +197,14 @@ TEST_SUITE_END() // U16_to_U32
 TEST_SUITE(S16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                   DepthConvertLayerShiftDatasetNightly))
+                                                                                                                   DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -215,14 +214,14 @@ TEST_SUITE_END() // S16_to_U8
 TEST_SUITE(S16_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index b2009c26ad..d4dbcec9d9 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,6 +41,9 @@ namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
+
 namespace
 {
 RelativeTolerance<half_float::half>  tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
@@ -48,16 +51,47 @@ constexpr RelativeTolerance<float>   tolerance_f32(0.01f);                  /**<
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);                  /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
 constexpr float                      tolerance_num = 0.05f;                 /**< Tolerance number */
 
-const auto depth_multipliers       = framework::dataset::make("DepthMultiplier", { 1, 2, 5 });
-const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 });
+const auto depth_multipliers       = make("DepthMultiplier", { 1, 4 });
+const auto large_depth_multipliers = make("DepthMultiplier", { 2, 5, 8 });
 
-//Activation Functions
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+// Activation Functions
+const auto NoActivation = make("ActivationInfo", ActivationLayerInfo());
+
+const auto ActivationFunctionsSmallDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 2.f, 0.f)
+});
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.8f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU)
+});
+
+const auto ActivationFunctionsQuantizedSmallDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 2.f, 0.f)
+});
+
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo",
+{
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f, 0.f)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 2.3f, -1.5f),
 });
+
+const auto IgnoredQuantizationInfo = make("IgnoredQuantizationInfo", QuantizationInfo());
+
 } // namespace
 
 TEST_SUITE(CL)
@@ -65,85 +99,85 @@ TEST_SUITE(DepthwiseConvolutionLayer)
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching data type input/weights
-                                                        TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),    // Mismatching input feature maps
-                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching depth multiplier
-                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases size
-                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases dimensions
-                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid output size
-                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // patch size bigger than input width
-                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // dilation < 1
-                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),
-                                                        TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8),
-                                                      }),
-                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
-                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8),
-                                                        })),
-                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(2U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(24U), 1, DataType::S32),
-                                                       })),
-                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8),
-                                                       })),
-                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 1, 0),
-                                                      })),
-                framework::dataset::make("DepthMultiplier", { 1,
-                                                              1,
-                                                              3,
-                                                              1,
-                                                              1,
-                                                              1,
-                                                              2,
-                                                              2,
-                                                              2,
-                                                              3,
-                                                             })),
-                framework::dataset::make("Dilation", { Size2D(1U, 1U),
-                                                              Size2D(1U, 1U),
-                                                              Size2D(1U, 1U),
-                                                              Size2D(1U, 1U),
-                                                              Size2D(1U, 1U),
-                                                              Size2D(1U, 1U),
-                                                              Size2D(20U, 1U),
-                                                              Size2D(0U, 1U),
-                                                              Size2D(1U, 1U),
-                                                              Size2D(1U, 1U),
-                                                             })),
-                framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true, true })),
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+                make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching data type input/weights
+                                TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),    // Mismatching input feature maps
+                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching depth multiplier
+                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases size
+                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases dimensions
+                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid output size
+                                TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // patch size bigger than input width
+                                TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // dilation < 1
+                                TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),
+                                TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8),
+                                }),
+                make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
+                                TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8),
+                        }),
+                make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                TensorInfo(TensorShape(2U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(24U), 1, DataType::S32),
+                        }),
+                make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
+                                TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8),
+                        }),
+                make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 0, 0),
+                                PadStrideInfo(1, 1, 1, 0),
+                                }),
+                make("DepthMultiplier", { 1,
+                                        1,
+                                        3,
+                                        1,
+                                        1,
+                                        1,
+                                        2,
+                                        2,
+                                        2,
+                                        3,
+                                }),
+                make("Dilation", { Size2D(1U, 1U),
+                                Size2D(1U, 1U),
+                                Size2D(1U, 1U),
+                                Size2D(1U, 1U),
+                                Size2D(1U, 1U),
+                                Size2D(1U, 1U),
+                                Size2D(20U, 1U),
+                                Size2D(0U, 1U),
+                                Size2D(1U, 1U),
+                                Size2D(1U, 1U),
+                                }),
+                make("Expected", { false, false, false, false, false, false, false, false, true, true })),
                 input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, dilation, expected)
 {
     bool is_valid = bool(CLDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true), &weights_info.clone()->set_is_resizable(true), &biases_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), conv_info, depth_multiplier,ActivationLayerInfo(), dilation));
@@ -154,48 +188,34 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
 
 template <typename T>
 using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
+template <typename T>
+using CLDepthwiseConvolutionLayerMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T, true>;
+template <typename T>
+using CLDepthwiseConvolutionLayerInPlaceFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T, false, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 TEST_SUITE(W3x3)
 TEST_SUITE(NCHW)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL,
-                           combine(combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                      datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
-                                                           depth_multipliers),
-                                                   framework::dataset::make("DataType",
-                                                                            DataType::F16)),
-                                           framework::dataset::make("DataLayout", DataLayout::NCHW)),
-                                   ActivationFunctionsDataset))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                        large_depth_multipliers),
-                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                                DataType::F16)),
-                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)),
-                                                                                                                        ActivationFunctionsDataset))
+    combine(
+        framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                        datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+        depth_multipliers,
+        make("DataType", DataType::F16),
+        make("DataLayout", DataLayout::NCHW),
+        ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                                                                    depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                                    ActivationFunctionsDataset))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                           large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
-                                                                            DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::F16),
+        make("DataLayout", { DataLayout::NCHW }),
+        ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
@@ -204,41 +224,42 @@ TEST_SUITE_END() // NCHW
 
 TEST_SUITE(NHWC)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL,
-                           combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                           depth_multipliers),
-                                                   framework::dataset::make("DataType",
-                                                                            DataType::F16)),
-                                           framework::dataset::make("DataLayout", DataLayout::NHWC)),
-                                   ActivationFunctionsDataset))
+    combine(
+        datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::F16),
+        make("DataLayout", DataLayout::NHWC),
+        ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                        large_depth_multipliers),
-                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                                DataType::F16)),
-                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NHWC)),
-                                                                                                                        ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::LargeDepthwiseConvolutionLayerDataset3x3Fp16Subset(),
+        large_depth_multipliers,
+        make("DataType", DataType::F16),
+        make("DataLayout", DataLayout::NHWC),
+        make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                                                                                     depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                    make("DataType",
                                                                                                                             DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+                                                                                                                    make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                   ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3Fp16Subset(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
@@ -249,19 +270,33 @@ TEST_SUITE_END() // W3x3
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                                                                                                     depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                    make("DataType",
                                                                                                                             DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+                                                                                                                    make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDatasetFp16Subset(),
                                                                                                                         large_depth_multipliers),
-                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                        make("DataType",
                                                                                                                                 DataType::F16)),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                        ActivationFunctionsDataset))
+                                                                                                                        make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 2 }),
+        make("DataType", DataType::F16),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
@@ -269,25 +304,38 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, f
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                                                                                     depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                    make("DataType",
                                                                                                                             DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+                                                                                                                    make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDatasetFp16Subset(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
+
+TEST_SUITE(InPlace) // FP16 cases using CLDepthwiseConvolutionLayerInPlaceFixture
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture<half>, framework::DatasetMode::ALL,
+                           combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(),
+                                                           make("DepthMultiplier", { 1 })), // depth multiplier fixed to 1
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })), // NHWC only
+                                  ActivationFunctionsSmallDataset))
+{
+    validate(CLAccessor(_src), _reference, tolerance_f16, tolerance_num); // compares _src (not _target) with the reference; NOTE(review): presumably the fixture writes the result back into the source tensor - confirm in the fixture
+}
+TEST_SUITE_END() // InPlace
 TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
@@ -297,84 +345,76 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>,
                            combine(combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                       datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", DataLayout::NCHW)),
-                                   ActivationFunctionsDataset))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                           large_depth_multipliers),
-                           framework::dataset::make("DataType",
-                                                    DataType::F32)),
-                           framework::dataset::make("DataLayout", DataLayout::NCHW)),
-                           ActivationFunctionsDataset))
+                                           make("DataLayout", DataLayout::NCHW)),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
                            combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", DataLayout::NCHW)),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", DataLayout::NCHW)),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                           large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
-                                                                            DataType::F32)),
-                                           framework::dataset::make("DataLayout", DataLayout::NCHW)),
-                                   ActivationFunctionsDataset))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // NCHW
+
 TEST_SUITE(NHWC)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
                            combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", DataLayout::NHWC)),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", DataLayout::NHWC)),
+                                  ActivationFunctionsSmallDataset))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, // FP32 3x3 NHWC case run through the mixed-data-layout fixture alias
+                           combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", { 2 })), // one depth multiplier instead of the shared depth_multipliers dataset
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", DataLayout::NHWC)),
+                                   make("ActivationInfo", ActivationLayerInfo()))) // default-constructed ActivationLayerInfo; NOTE(review): presumably means "no activation" - confirm
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                            large_depth_multipliers),
-                           framework::dataset::make("DataType",
+                           make("DataType",
                                                     DataType::F32)),
-                           framework::dataset::make("DataLayout", DataLayout::NHWC)),
-                           ActivationFunctionsDataset))
+                           make("DataLayout", DataLayout::NHWC)),
+                           make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE(Dilation)
 
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
                            combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", DataLayout::NHWC)),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", DataLayout::NHWC)),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", DataLayout::NHWC)),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", DataLayout::NHWC)),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
@@ -385,19 +425,45 @@ TEST_SUITE_END() // W3x3
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                                                                                                      depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                     make("DataType",
                                                                                                                              DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     ActivationFunctionsDataset))
+                                                                                                                     make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                            large_depth_multipliers),
-                           framework::dataset::make("DataType",
+                           make("DataType",
                                                     DataType::F32)),
-                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                           ActivationFunctionsDataset))
+                           make("DataLayout", { DataLayout::NHWC })),
+                           make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE_NEW(RunLargeKernelSize, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, // FP32 case over LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset, registered for DatasetMode::ALL
+                           combine(combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(),
+                                                           make("DepthMultiplier", { 1 })), // depth multiplier fixed to 1
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })), // NHWC only
+                                  ActivationFunctionsSmallDataset))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // single-tolerance overload, as in the other FP32 cases of this file
+}
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, // FP32, NIGHTLY: one fixed configuration swept over ActivationFunctionsDataset
+    combine( // single combine() call over every parameter dataset listed below
+        make("In", TensorShape(33U, 27U, 11U, 3U)), // one input shape
+        make("Weights", Size2D(3U, 4U)), // one kernel size
+        make("Info", PadStrideInfo(1, 2, 0, 1)), // NOTE(review): presumably (stride_x, stride_y, pad_x, pad_y) - confirm against PadStrideInfo
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 2 }), // one depth multiplier
+        make("DataType", DataType::F32),
+        make("DataLayout", { DataLayout::NHWC }), // NHWC only
+        ActivationFunctionsDataset)) // the only multi-valued axis visible here; defined outside this hunk
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
@@ -405,78 +471,130 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>,
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                                                                                      depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                     make("DataType",
                                                                                                                              DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     ActivationFunctionsDataset))
+                                                                                                                     make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
+
+TEST_SUITE(InPlace) // FP32 cases using CLDepthwiseConvolutionLayerInPlaceFixture
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerInPlaceFixture<float>, framework::DatasetMode::ALL,
+                           combine(combine(combine(combine(datasets::SmallInPlaceDepthwiseConvolutionLayerDataset(),
+                                                           make("DepthMultiplier", { 1 })), // depth multiplier fixed to 1
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })), // NHWC only
+                                  ActivationFunctionsSmallDataset))
+{
+    validate(CLAccessor(_src), _reference, tolerance_f32); // compares _src (not _target) with the reference; NOTE(review): presumably the fixture writes the result back into the source tensor - confirm in the fixture
+}
+TEST_SUITE_END() // InPlace
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 template <typename T>
 using CLDepthwiseConvolutionLayerQuantizedFixture = DepthwiseConvolutionLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
 template <typename T>
+using CLDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T, true>; // same fixture template as CLDepthwiseConvolutionLayerQuantizedFixture plus a trailing 'true' argument; NOTE(review): presumably the mixed-data-layout switch - confirm in the fixture header
+template <typename T>
 using CLDepthwiseConvolutionLayerQuantizedPerChannelFixture = DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T, int8_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NCHW is tested with int8
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }), // NCHW is tested with int8
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 128), QuantizationInfo(2.2f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(1.f, 128) }),
+        make("DataLayout", { DataLayout::NHWC }), // NCHW is tested with int8
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, // QASYMM8, NIGHTLY: one fixed configuration swept over ActivationFunctionsQuantizedDataset
+    combine( // single combine() call over every parameter dataset listed below
+        make("In", TensorShape(33U, 27U, 11U, 3U)), // one input shape
+        make("Weights", Size2D(3U, 4U)), // one kernel size
+        make("Info", PadStrideInfo(1, 2, 0, 1)), // NOTE(review): presumably (stride_x, stride_y, pad_x, pad_y) - confirm against PadStrideInfo
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 2U }), // NOTE(review): unsigned literal here, while the float RunActivations cases in this file use { 2 } - confirm the element type is intended
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(2.2f, 10) }), // explicit source quantization instead of IgnoredQuantizationInfo
+        make("DstQuantizationInfo", { QuantizationInfo(0.1f, 128) }), // explicit destination quantization
+        make("DataLayout", { DataLayout::NHWC }), // NHWC only
+        ActivationFunctionsQuantizedDataset)) // the only multi-valued axis visible here; defined outside this hunk
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NCHW is tested with int8
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }), // NCHW is tested with int8
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // QASYMM8 dilated case with explicit quantization info and ActivationFunctionsQuantizedSmallDataset
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) }), // two source quantization settings
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }), // float literal, matching every other QuantizationInfo in this file (avoids an implicit double-to-float conversion)
+        make("DataLayout", { DataLayout::NHWC }), // NCHW is tested with int8
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(1.3f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
@@ -484,47 +602,80 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // QASYMM8 dilated 3x3 case run through the quantized mixed-data-layout fixture alias
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", { 2 }), // one depth multiplier instead of the shared depth_multipliers dataset
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // passed twice: the fixture receives two quantization-info parameters here, as in the sibling quantized cases
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), // both layouts are exercised
+        NoActivation))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
@@ -535,25 +686,73 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, CLDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NCHW }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, CLDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        make("DepthMultiplier", { 2 }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 2U }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, CLDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) }),
+        make("DataLayout", { DataLayout::NCHW }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
@@ -566,24 +765,40 @@ TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                                                                                    large_depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 2U }),
+        make("SrcDataType", DataType::QASYMM8_SIGNED),
+        make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
@@ -591,24 +806,24 @@ TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
                                                                                    large_depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
@@ -618,24 +833,24 @@ TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                                                    large_depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
@@ -643,24 +858,24 @@ TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                  ActivationFunctionsSmallDataset))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                                                    large_depth_multipliers),
-                                                                           framework::dataset::make("SrcDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                           framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                                           make("SrcDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
index 6b917d8962..012018c0fc 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,6 @@
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -43,6 +42,7 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 using namespace arm_compute::misc::shape_calculator;
 
 // Create function for CLDepthwiseConvolutionLayerNativeKernel
@@ -63,64 +63,89 @@ RelativeTolerance<half_float::half>  rel_tolerance_f16(half_float::half(0.01f));
 constexpr float                      abs_tolerance_f16(0.03f);
 
 /** Width values to test - Precommit */
-const auto width_values_precommit = framework::dataset::make("width", { 1U, 17U, 32U } );
+const auto width_values_precommit = make("width", { 1U, 33U } );
 
 /** Width values to test - Nightly */
-const auto width_values_nightly = framework::dataset::make("width", { 53U, 47U } );
+const auto width_values_nightly = make("width", { 53U, 47U } );
 
 /** Height values to test - Precommit */
-const auto height_values_precommit = framework::dataset::make("height", { 19U } );
+const auto height_values_precommit = make("height", { 19U } );
 
 /** Height values to test - Nightly */
-const auto height_values_nightly = framework::dataset::make("height", { 39U, 43U } );
+const auto height_values_nightly = make("height", { 39U, 43U } );
 
 /** Channel values to test - Precommit */
-const auto channel_values_precommit = framework::dataset::make("channels", { 15U });
+const auto channel_values_precommit = make("channels", { 15U });
 
 /** Channel values to test - Nightly */
-const auto channel_values_nightly = framework::dataset::make("channels", { 33U, 19U });
+const auto channel_values_nightly = make("channels", { 33U, 19U });
+
+/** Channel values to test with cl_image support - Precommit */
+const auto channel_values_export_to_cl_image_precommit = make("channels", { 16U });
+
+/** Channel values to test with cl_image support - Nightly */
+const auto channel_values_export_to_cl_image_nightly = make("channels", { 32U });
 
 /** Batch values to test - Precommit */
-const auto batch_values_precommit = framework::dataset::make("batch", { 1U, 2U });
+const auto batch_values_precommit = make("batch", { 1U, 2U });
 
 /** Batch values to test - Nightly */
-const auto batch_values_nightly = framework::dataset::make("batch", { 1U, 3U });
+const auto batch_values_nightly = make("batch", { 3U });
 
 /** Kernel size values to test - Precommit */
-const auto kernel_sz_values_precommit = framework::dataset::make("kernel_size", { Size2D(1U, 1U), Size2D(1U, 3U), Size2D(5U, 5U) });
+const auto kernel_sz_values_precommit = make("kernel_size", { Size2D(1U, 1U), Size2D(1U, 3U), Size2D(5U, 5U) });
 
 /** Kernel size values to test - Nightly */
-const auto kernel_sz_values_nightly = framework::dataset::make("kernel_size", { Size2D(3U, 5U), Size2D(5U, 1U), Size2D(1U, 7U), Size2D(9U, 7U) });
+const auto kernel_sz_values_nightly = make("kernel_size", { Size2D(3U, 5U), Size2D(5U, 1U), Size2D(1U, 7U), Size2D(9U, 7U) });
 
 /** Depth multiplier values to test - All */
-const auto depth_multiplier_values = framework::dataset::make("depth_multiplier", {3U});
+const auto depth_multiplier_values = make("depth_multiplier", {3U});
 
 /** Dilation values to test - All */
-const auto dilation_values = framework::dataset::make("dilation", { Size2D(1U, 1U), Size2D(3U, 3U) });
+const auto dilation_values = make("dilation", { Size2D(1U, 1U), Size2D(3U, 3U) });
 
 /** Stride values to test - All */
-const auto stride_values = framework::dataset::make("stride", { Size2D(1U, 1U), Size2D(3U, 2U) });
+const auto stride_values = make("stride", { Size2D(1U, 1U), Size2D(3U, 2U) });
 
-/** Padding values to test - All */
-const auto padding_valid_values = framework::dataset::make("padding_valid", { true, false });
+/** Padding values to test - Precommit */
+const auto padding_valid_values = make("padding_valid", { true, false });
 
-/** Data type values to test - All */
-const auto data_type_values = framework::dataset::make("data_type", { DataType::F32, DataType::F16 });
+/** Padding values to test - Nightly */
+const auto padding_valid_values_nightly = make("padding_valid", { false });
 
 /** Data layout values to test - All */
-const auto data_layout_values = framework::dataset::make("data_layout", { DataLayout::NHWC });
+const auto data_layout_values = make("data_layout", { DataLayout::NHWC });
 
 /** N0 values to test - Precommit */
-const auto n0_values_precommit = framework::dataset::make("N0", {2, 4});
+const auto n0_values_precommit = make("N0", {2, 4});
 
 /** N0 values to test - Nightly */
-const auto n0_values_nightly = framework::dataset::make("N0", {3, 8});
+const auto n0_values_nightly = make("N0", {3, 8});
+
+/** N0 values to test with cl_image support - Precommit */
+const auto n0_values_export_to_cl_image_precommit = make("N0", {4});
+
+/** N0 values to test with cl_image support - Nightly */
+const auto n0_values_export_to_cl_image_nightly = make("N0", {8});
 
-/** Activation values to test */
-const auto act_values = framework::dataset::make("Activation",
+/** Activation values to test in precommit */
+const auto act_values = make("Activation", { ActivationLayerInfo() });
+
+const auto activations_rest = make("Activation",
 {
-    ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.8f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU)
 });
 
 } // namespace
@@ -130,94 +155,264 @@ TEST_SUITE(DepthwiseConvolutionLayerNative)
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                                                 width_values_precommit,
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
                                                                                                 batch_values_precommit),
                                                                                                 kernel_sz_values_precommit),
-                                                                                                framework::dataset::make("depth_multiplier", 1)),
+                                                                                                make("depth_multiplier", 1)),
                                                                                                 dilation_values),
                                                                                                 stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F32)),
+                                                                                                make("DataType", DataType::F32)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                n0_values_precommit))
+                                                                                                n0_values_precommit),
+                                                                                                make("ExportToCLImage", false)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                                                 width_values_nightly,
                                                                                                 height_values_nightly),
                                                                                                 channel_values_nightly),
                                                                                                 batch_values_nightly),
                                                                                                 kernel_sz_values_nightly),
-                                                                                                framework::dataset::make("depth_multiplier", 1)),
+                                                                                                make("depth_multiplier", 1)),
                                                                                                 dilation_values),
                                                                                                 stride_values),
-                                                                                                padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F32)),
+                                                                                                padding_valid_values_nightly),
+                                                                                                make("DataType", DataType::F32)),
                                                                                                 data_layout_values),
-                                                                                                act_values),
-                                                                                                n0_values_nightly))
+                                                                                                make("Activation", { ActivationLayerInfo() })),
+                                                                                                n0_values_nightly),
+                                                                                                make("ExportToCLImage", false)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+    combine(
+        make("width", { 33U } ),
+        height_values_precommit,
+        channel_values_precommit,
+        make("batch", { 2U } ),
+        make("kernel_size", { Size2D(5U, 5U) }),
+        make("depth_multiplier", 1),
+        make("dilation", Size2D(3U, 3U)),
+        make("stride", Size2D(3U, 2U)),
+        padding_valid_values_nightly,
+        make("DataType", DataType::F32),
+        data_layout_values,
+        activations_rest,
+        n0_values_precommit,
+        make("ExportToCLImage", false)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                width_values_precommit,
+                                                                                                height_values_precommit),
+                                                                                                channel_values_export_to_cl_image_precommit),
+                                                                                                batch_values_precommit),
+                                                                                                kernel_sz_values_precommit),
+                                                                                                make("depth_multiplier", 1)),
+                                                                                                dilation_values),
+                                                                                                stride_values),
+                                                                                                padding_valid_values),
+                                                                                                make("DataType", DataType::F32)),
+                                                                                                data_layout_values),
+                                                                                                act_values),
+                                                                                                n0_values_export_to_cl_image_precommit),
+                                                                                                make("ExportToCLImage", true)))
+{
+    // Validate only when _validate_output is set; otherwise report the test as skipped
+    if(_validate_output)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                width_values_nightly,
+                                                                                                height_values_nightly),
+                                                                                                channel_values_export_to_cl_image_nightly),
+                                                                                                batch_values_nightly),
+                                                                                                kernel_sz_values_nightly),
+                                                                                                make("depth_multiplier", 1)),
+                                                                                                dilation_values),
+                                                                                                stride_values),
+                                                                                                padding_valid_values_nightly),
+                                                                                                make("DataType", DataType::F32)),
+                                                                                                data_layout_values),
+                                                                                                make("Activation", { ActivationLayerInfo() })),
+                                                                                                n0_values_export_to_cl_image_nightly),
+                                                                                                make("ExportToCLImage", true)))
+{
+    // Validate only when _validate_output is set; otherwise report the test as skipped
+    if(_validate_output)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() // ExportWeightsToCLImage
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                                                 width_values_precommit,
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
                                                                                                 batch_values_precommit),
                                                                                                 kernel_sz_values_precommit),
-                                                                                                framework::dataset::make("depth_multiplier", 1)),
+                                                                                                make("depth_multiplier", 1)),
                                                                                                 dilation_values),
                                                                                                 stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F16)),
+                                                                                                make("DataType", DataType::F16)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                n0_values_precommit))
+                                                                                                n0_values_precommit),
+                                                                                                make("ExportToCLImage", false)))
 {
     // Validate output
         validate(CLAccessor(_target), _reference, rel_tolerance_f16);
 }
-
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::NIGHTLY,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                                                width_values_nightly,
-                                                                                                height_values_nightly),
-                                                                                                channel_values_nightly),
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 47U } ),
+                                                                                                make("height", { 39U } )),
+                                                                                                make("channels", { 19U } )),
                                                                                                 batch_values_nightly),
-                                                                                                kernel_sz_values_nightly),
-                                                                                                framework::dataset::make("depth_multiplier", 1)),
+                                                                                                make("kernel_size", { Size2D(5U, 5U) })),
+                                                                                                make("depth_multiplier", 1)),
+                                                                                                make("dilation", { Size2D(3U, 3U) })),
+                                                                                                make("stride", { Size2D(3U, 2U) })),
+                                                                                                padding_valid_values_nightly),
+                                                                                                make("DataType", DataType::F16)),
+                                                                                                data_layout_values),
+                                                                                                make("Activation", { ActivationLayerInfo() })),
+                                                                                                n0_values_nightly),
+                                                                                                make("ExportToCLImage", false)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+    combine(
+        make("width", { 33U } ),
+        height_values_precommit,
+        channel_values_precommit,
+        make("batch", { 2U } ),
+        make("kernel_size", { Size2D(5U, 5U) }),
+        make("depth_multiplier", 4),
+        make("dilation", Size2D(3U, 3U)),
+        make("stride", Size2D(3U, 2U)),
+        padding_valid_values_nightly,
+        make("DataType", DataType::F16),
+        data_layout_values,
+        activations_rest,
+        n0_values_precommit,
+        make("ExportToCLImage", false)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                width_values_precommit,
+                                                                                                height_values_precommit),
+                                                                                                channel_values_export_to_cl_image_precommit),
+                                                                                                batch_values_precommit),
+                                                                                                kernel_sz_values_precommit),
+                                                                                                make("depth_multiplier", 1)),
                                                                                                 dilation_values),
                                                                                                 stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F16)),
+                                                                                                make("DataType", DataType::F16)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                n0_values_nightly))
+                                                                                                n0_values_export_to_cl_image_precommit),
+                                                                                                make("ExportToCLImage", true)))
 {
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    // Validate only when _validate_output is set; otherwise report the test as skipped
+    if(_validate_output)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 47U } ),
+                                                                                                make("height", { 39U } )),
+                                                                                                channel_values_export_to_cl_image_nightly),
+                                                                                                batch_values_nightly),
+                                                                                                make("kernel_size", { Size2D(5U, 5U) })),
+                                                                                                make("depth_multiplier", 1)),
+                                                                                                make("dilation", { Size2D(3U, 3U) })),
+                                                                                                make("stride", { Size2D(3U, 2U) })),
+                                                                                                padding_valid_values_nightly),
+                                                                                                make("DataType", DataType::F16)),
+                                                                                                data_layout_values),
+                                                                                                make("Activation", { ActivationLayerInfo() })),
+                                                                                                n0_values_export_to_cl_image_nightly),
+                                                                                                make("ExportToCLImage", true)))
+{
+    // Validate only when _validate_output is set; otherwise report the test as skipped
+    if(_validate_output)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
+TEST_SUITE_END() // ExportWeightsToCLImage
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
+
 TEST_SUITE(DepthMultiplier)
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                                                width_values_precommit,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 33U } ),
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
                                                                                                 batch_values_precommit),
@@ -226,18 +421,19 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<fl
                                                                                                 dilation_values),
                                                                                                 stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F32)),
+                                                                                                make("DataType", DataType::F32)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                framework::dataset::make("N0", 1)))
+                                                                                                make("N0", 1)),
+                                                                                                make("ExportToCLImage", false)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                                                width_values_nightly,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 53U } ),
                                                                                                 height_values_nightly),
                                                                                                 channel_values_nightly),
                                                                                                 batch_values_nightly),
@@ -245,21 +441,77 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<fl
                                                                                                 depth_multiplier_values),
                                                                                                 dilation_values),
                                                                                                 stride_values),
+                                                                                                padding_valid_values_nightly),
+                                                                                                make("DataType", DataType::F32)),
+                                                                                                data_layout_values),
+                                                                                                make("Activation", { ActivationLayerInfo() })),
+                                                                                                make("N0", 1)),
+                                                                                                make("ExportToCLImage", false)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE(DepthMultiplierMultipleOfOutputChannels)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 33U } ),
+                                                                                                height_values_precommit),
+                                                                                                channel_values_precommit),
+                                                                                                batch_values_precommit),
+                                                                                                kernel_sz_values_precommit),
+                                                                                                make("depth_multiplier", 2)),
+                                                                                                dilation_values),
+                                                                                                stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F32)),
+                                                                                                make("DataType", DataType::F32)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                framework::dataset::make("N0", 1)))
+                                                                                                make("N0", {2})),
+                                                                                                make("ExportToCLImage", false)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 33U } ),
+                                                                                                height_values_precommit),
+                                                                                                channel_values_precommit),
+                                                                                                batch_values_precommit),
+                                                                                                kernel_sz_values_precommit),
+                                                                                                make("depth_multiplier", 4)),
+                                                                                                dilation_values),
+                                                                                                stride_values),
+                                                                                                padding_valid_values),
+                                                                                                make("DataType", DataType::F32)),
+                                                                                                data_layout_values),
+                                                                                                act_values),
+                                                                                                make("N0", {4})),
+                                                                                                make("ExportToCLImage", true)))
+{
+    // Validate only when _validate_output is set; otherwise report the test as skipped
+    if(_validate_output)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // ExportWeightsToCLImage
+TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                                                width_values_precommit,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 33U } ),
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
                                                                                                 batch_values_precommit),
@@ -268,18 +520,19 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<ha
                                                                                                 dilation_values),
                                                                                                 stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F16)),
+                                                                                                make("DataType", DataType::F16)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                framework::dataset::make("N0", 1)))
+                                                                                                make("N0", 1)),
+                                                                                                make("ExportToCLImage", false)))
 {
     // Validate output
         validate(CLAccessor(_target), _reference, rel_tolerance_f16);
 }
 
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::NIGHTLY,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                                                width_values_nightly,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 53U } ),
                                                                                                 height_values_nightly),
                                                                                                 channel_values_nightly),
                                                                                                 batch_values_nightly),
@@ -287,15 +540,71 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture<ha
                                                                                                 depth_multiplier_values),
                                                                                                 dilation_values),
                                                                                                 stride_values),
+                                                                                                padding_valid_values_nightly),
+                                                                                                make("DataType", DataType::F16)),
+                                                                                                data_layout_values),
+                                                                                                make("Activation", { ActivationLayerInfo() })),
+                                                                                                make("N0", 1)),
+                                                                                                make("ExportToCLImage", false)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE(DepthMultiplierMultipleOfOutputChannels)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 33U } ),
+                                                                                                height_values_precommit),
+                                                                                                channel_values_precommit),
+                                                                                                batch_values_precommit),
+                                                                                                kernel_sz_values_precommit),
+                                                                                                make("depth_multiplier", 2)),
+                                                                                                dilation_values),
+                                                                                                stride_values),
                                                                                                 padding_valid_values),
-                                                                                                framework::dataset::make("DataType", DataType::F16)),
+                                                                                                make("DataType", DataType::F16)),
                                                                                                 data_layout_values),
                                                                                                 act_values),
-                                                                                                framework::dataset::make("N0", 1)))
+                                                                                                make("N0", {2})),
+                                                                                                make("ExportToCLImage", false)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
 }
+
+TEST_SUITE(ExportWeightsToCLImage)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                make("width", { 33U } ),
+                                                                                                height_values_precommit),
+                                                                                                channel_values_precommit),
+                                                                                                batch_values_precommit),
+                                                                                                kernel_sz_values_precommit),
+                                                                                                make("depth_multiplier", 4)),
+                                                                                                dilation_values),
+                                                                                                stride_values),
+                                                                                                padding_valid_values),
+                                                                                                make("DataType", DataType::F16)),
+                                                                                                data_layout_values),
+                                                                                                act_values),
+                                                                                                make("N0", {4})),
+                                                                                                make("ExportToCLImage", true)))
+{
+    // Validate only when _validate_output is set; otherwise the test is reported as skipped
+    if(_validate_output)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // ExportWeightsToCLImage
+TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DepthMultiplier
diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp
index 9a9df2c7e4..776bf34151 100644
--- a/tests/validation/CL/DilatedConvolutionLayer.cpp
+++ b/tests/validation/CL/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -167,13 +167,18 @@ template <typename T>
 using CLGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
 
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test without a fused activation has its quantization info ignored.
+/// This is because, instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// Once dynamic quantization is also supported in the presence of an activation, the explicit
+/// quantization info can be removed.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
 {
     // Validate output
@@ -185,7 +190,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMDilatedConvolutionLayerQuantizedFixture<u
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 0) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
 {
     // Validate output
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index e244576daf..ff22ae5ef0 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,6 +35,9 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DirectConvolutionLayerFixture.h"
 
+/** Synced with tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+ *  Please check there for any differences in the coverage
+ */
 namespace arm_compute
 {
 namespace test
@@ -43,10 +46,12 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<half>              tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */
-RelativeTolerance<float>             tolerance_fp32(0.05f);     /**< Tolerance for floating point tests */
-constexpr float                      tolerance_num = 0.07f;     /**< Tolerance number */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);      /**< Tolerance for quantized tests */
+RelativeTolerance<half>  tolerance_fp16(half(0.2));  /**< Tolerance for floating point tests */
+RelativeTolerance<float> tolerance_fp32(0.05f);      /**< Tolerance for floating point tests */
+constexpr float          abs_tolerance_f32(0.0001f); /**< Absolute tolerance for FP32 tests */
+
+constexpr float                      tolerance_num = 0.07f; /**< Tolerance number */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);  /**< Tolerance for quantized tests */
 
 const auto data_strides          = combine(framework::dataset::make("StrideX", 1, 3), framework::dataset::make("StrideY", 1, 3));
 const auto data_strides_small    = combine(framework::dataset::make("StrideX", 1), framework::dataset::make("StrideY", 1));
@@ -87,55 +92,132 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
 TEST_SUITE(CL)
 TEST_SUITE(DirectConvolutionLayer)
 
+/** Check whether the configuration of a Direct Convolution layer with no
+ * bias leads to a successful execution.
+ */
+TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
+{
+    const auto     src_shape     = TensorShape(27U, 13U, 2U);
+    const auto     weights_shape = TensorShape(3U, 3U, 2U, 4U);
+    const auto     bias_shape    = TensorShape(4U);
+    const auto     dst_shape     = TensorShape(25U, 11U, 4U);
+    constexpr auto dt            = DataType::F32;
+
+    auto src     = create_tensor<CLTensor>(src_shape, dt);
+    auto weights = create_tensor<CLTensor>(weights_shape, dt);
+    auto dst     = create_tensor<CLTensor>(dst_shape, dt);
+
+    const auto conv_info = PadStrideInfo(1, 1, 0, 0);
+
+    // Create Direct Convolution function
+    CLDirectConvolutionLayer conv{};
+    conv.configure(&src, &weights, nullptr, &dst, conv_info);
+
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    library->fill_tensor_value(CLAccessor(src), 1.f);
+    library->fill_tensor_value(CLAccessor(weights), 1.f);
+
+    conv.run();
+
+    // Compute reference to compare
+    SimpleTensor<float> ref_src{ src_shape, dt };
+    SimpleTensor<float> ref_weights{ weights_shape, dt };
+    SimpleTensor<float> ref_bias{ bias_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_weights, 1.f);
+    // No bias
+    library->fill_tensor_value(ref_bias, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_weights, ref_bias, dst_shape, conv_info);
+
+    validate(CLAccessor(dst), ref_dst);
+}
+
+/** Check whether a rectangular (non-square) kernel, i.e. one where the width and height of
+ *  weights_shape are not equal, leads to a successful run
+ */
+TEST_CASE(NonSquareKernel, framework::DatasetMode::PRECOMMIT)
+{
+    auto           src_shape     = TensorShape(33U, 27U, 3U);
+    auto           weights_shape = TensorShape(5U, 7U, 3U, 4U); // non-square kernel
+    const auto     bias_shape    = TensorShape(4U);
+    auto           dst_shape     = TensorShape(11U, 12U, 4U);
+    constexpr auto dt            = DataType::F32;
+
+    TensorShape src_shape_nhwc(src_shape);
+    TensorShape weights_shape_nhwc(weights_shape);
+    TensorShape dst_shape_nhwc(dst_shape);
+
+    // Non-square shapes are only allowed for NHWC
+    permute(src_shape_nhwc, PermutationVector(2U, 0U, 1U));
+    permute(weights_shape_nhwc, PermutationVector(2U, 0U, 1U));
+    permute(dst_shape_nhwc, PermutationVector(2U, 0U, 1U));
+
+    auto       src       = create_tensor<CLTensor>(src_shape_nhwc, dt, 1, QuantizationInfo(), DataLayout::NHWC);
+    auto       weights   = create_tensor<CLTensor>(weights_shape_nhwc, dt, 1, QuantizationInfo(), DataLayout::NHWC);
+    auto       dst       = create_tensor<CLTensor>(dst_shape_nhwc, dt, 1, QuantizationInfo(), DataLayout::NHWC);
+    const auto conv_info = PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR);
+
+    // Create direct convolution function
+    CLDirectConvolutionLayer conv{};
+    conv.configure(&src, &weights, nullptr, &dst, conv_info);
+
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    library->fill_tensor_value(CLAccessor(src), 1.f);
+    library->fill_tensor_value(CLAccessor(weights), 1.f);
+
+    conv.run();
+
+    // Compute reference to compare
+    SimpleTensor<float> ref_src{ src_shape, dt };
+    SimpleTensor<float> ref_weights{ weights_shape, dt };
+    SimpleTensor<float> ref_bias{ bias_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_weights, 1.f);
+    // No bias
+    library->fill_tensor_value(ref_bias, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_weights, ref_bias, dst_shape, conv_info);
+
+    validate(CLAccessor(dst), ref_dst);
+}
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching input feature maps
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported kernel width
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non-rectangular weights dimensions
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching data type input/weights
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching input feature maps
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid weights dimensions
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid stride
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases size
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases dimensions
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases size
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases dimensions
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
                                                        TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32),
                                                      }),
                framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F16),
                                                         TensorInfo(TensorShape(3U, 3U, 3U, 4U), 1, DataType::F32),
-                                                        TensorInfo(TensorShape(11U, 11U, 2U, 4U), 1, DataType::F32),
-                                                        TensorInfo(TensorShape(5U, 3U, 2U, 4U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(3U, 3U, 2U, 4U, 3U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
-                                                        TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
-                                                        TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("BiasesInfo",{ TensorInfo(TensorShape(4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(3U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(4U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(26U, 11U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("ConvInfo",  { PadStrideInfo(1, 1, 0, 0),
@@ -143,47 +225,95 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(3, 3, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                       })),
                        framework::dataset::make("ActivationInfo",
 {
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
 })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, true })),
                input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
 {
     bool is_valid = bool(CLDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, act_info));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
+// clang-format on
+// *INDENT-ON*
 
 template <typename T>
 using CLDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T>;
 template <typename T>
+using CLDirectConvolutionLayerMixedDataLayoutFixture = DirectConvolutionValidationFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T, true>;
+template <typename T>
 using CLDirectConvolutionValidationWithTensorShapesFixture = DirectConvolutionValidationWithTensorShapesFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T>;
 template <typename T>
 using CLDirectConvolutionLayerQuantizedFixture = DirectConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T>;
 template <typename T>
+using CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture = DirectConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T, true>;
+template <typename T>
 using CLDirectConvolutionValidationWithTensorShapesQuantizedFixture = DirectConvolutionValidationWithTensorShapesQuantizedFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T>;
 
 TEST_SUITE(NHWC)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", {
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Arbitrary weight sizes for NHWC are supported
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Non-square weights dimensions for NHWC are supported
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Strides > 2 for any kernel sizes for NHWC are supported
+                                                     }),
+               framework::dataset::make("WeightsInfo",{
+                                                        TensorInfo(TensorShape(2U, 13U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 5U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                     })),
+               framework::dataset::make("BiasesInfo",{
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                     })),
+               framework::dataset::make("OutputInfo",{
+                                                       TensorInfo(TensorShape(4U, 15U, 1U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U, 23U, 11U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U, 9U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                     })),
+               framework::dataset::make("ConvInfo",  {
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(3, 3, 0, 0),
+                                                      })),
+                       framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+})),
+               framework::dataset::make("Expected", { true, true, true })),
+               input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
+{
+    bool is_valid = bool(CLDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, act_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                combine(combine(combine(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
-                                                        TensorShape(9U, 5U, 6U, 4U),
-                                                        TensorShape(3U, 5U, 7U, 2U),
-                                                        TensorShape(32U, 37U, 3U) } ),
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
                framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
                framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
                framework::dataset::make("PadX", { 1, 3, 0, 4 })),
                framework::dataset::make("PadY", { 1, 3, 0, 4 })),
                framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
-               framework::dataset::make("NumKernels", { 7, 3, 1, 3 })),
+               framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
                framework::dataset::make("DataType",  DataType::F16)),
                framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
                framework::dataset::make("DataLayout", DataLayout::NHWC)))
@@ -212,23 +342,40 @@ TEST_SUITE_END() // FP16
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                combine(combine(combine(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
-                                                        TensorShape(9U, 5U, 6U, 4U),
-                                                        TensorShape(3U, 5U, 7U, 2U),
-                                                        TensorShape(32U, 37U, 3U) } ),
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
                framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
                framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
                framework::dataset::make("PadX", { 1, 3, 0, 4 })),
                framework::dataset::make("PadY", { 1, 3, 0, 4 })),
                framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
-               framework::dataset::make("NumKernels", { 7, 3, 1, 3 })),
+               framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
                framework::dataset::make("DataType",  DataType::F32)),
                framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
                framework::dataset::make("DataLayout", DataLayout::NHWC)))
 {
-    validate(CLAccessor(_target), _reference, tolerance_fp32);
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+               combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 2 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 3 })),
+               framework::dataset::make("KernelSize", { 3 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::F32)),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                combine(combine(combine(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
@@ -242,19 +389,18 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture<float>, framewo
                framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::IDENTITY) )),
                framework::dataset::make("DataLayout", DataLayout::NHWC)))
 {
-    validate(CLAccessor(_target), _reference, tolerance_fp32);
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
 }
-
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
-                                                        TensorShape(9U, 5U, 6U, 4U),
-                                                        TensorShape(3U, 5U, 7U, 2U),
-                                                        TensorShape(32U, 37U, 3U) } ),
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
                framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
                framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
                framework::dataset::make("PadX", { 1, 3, 0, 4 })),
@@ -268,7 +414,25 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+               combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 2 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 1 })),
+               framework::dataset::make("KernelSize", { 3 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::QASYMM8)),
+               framework::dataset::make("QuantizationInfo", QuantizationInfo(1.1f / 255, 10))),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
                combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
@@ -287,14 +451,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_
 }
 
 TEST_SUITE_END() // QASYMM8
-//
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
                combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
-                                                        TensorShape(9U, 5U, 6U, 4U),
-                                                        TensorShape(3U, 5U, 7U, 2U),
-                                                        TensorShape(32U, 37U, 3U) } ),
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
                framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
                framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
                framework::dataset::make("PadX", { 1, 3, 0, 4 })),
@@ -308,7 +471,25 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<int8_t
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+               combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 1 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 1 })),
+               framework::dataset::make("KernelSize", { 3 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::QASYMM8_SIGNED)),
+               framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255, 10))),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
                combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
@@ -329,9 +510,48 @@ TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // NHWC
 
+TEST_SUITE(NCHW)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", {
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, DataLayout::NCHW), // Unsupported kernel width
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, DataLayout::NCHW), // Non-rectangular weights dimensions are unsupported
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, DataLayout::NCHW)  // Unsupported stride
+                                                     }),
+               framework::dataset::make("WeightsInfo",{
+                                                        TensorInfo(TensorShape(11U, 11U, 2U, 4U), 1, DataType::F32, DataLayout::NCHW),
+                                                        TensorInfo(TensorShape(5U, 3U, 2U, 4U), 1, DataType::F32, DataLayout::NCHW),
+                                                        TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32, DataLayout::NCHW)
+                                                     })),
+               framework::dataset::make("BiasesInfo",{
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NCHW),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NCHW),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NCHW)
+                                                     })),
+               framework::dataset::make("OutputInfo",{
+                                                       TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32, DataLayout::NCHW),
+                                                       TensorInfo(TensorShape(23U, 11U, 4U), 1, DataType::F32, DataLayout::NCHW),
+                                                       TensorInfo(TensorShape(25U, 11U, 4U), 1, DataType::F32, DataLayout::NCHW)
+                                                     })),
+               framework::dataset::make("ConvInfo",  {
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(3, 3, 0, 0)
+                                                      })),
+                       framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})),
+               framework::dataset::make("Expected", { false, false, false})),
+               input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
+{
+    bool is_valid = bool(CLDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, act_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
-TEST_SUITE(NCHW)
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit, framework::dataset::make("DataType", DataType::F16)),
@@ -356,13 +576,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture<float>, framewo
                                                                                                                     ActivationFunctionsDataset),
                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
-    validate(CLAccessor(_target), _reference, tolerance_fp32);
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit,
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       ActivationFunctionsDataset),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly, framework::dataset::make("DataType", DataType::F32)),
                                                                                                                   ActivationFunctionsDataset),
                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
-    validate(CLAccessor(_target), _reference, tolerance_fp32);
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
 }
 TEST_SUITE_END() // FP32
 
@@ -372,91 +600,202 @@ FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesFixture
                        ActivationFunctionsDataset))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_fp32);
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
 }
 TEST_SUITE_END() // FP32_CustomDataset
 TEST_SUITE_END() // Float
 
+/// @note: Every quantized test has one version with and one version without activation, because the quantization
+/// info given is ignored when there is no activation. In that case, instead of using the same quantization
+/// information for all the tensors, the fixture generates separate quantization info for each input and the output
+/// tensor. Once dynamic quantization is also supported in the presence of activation, these two versions should be
+/// merged again, with the explicitly specified quantization info removed.
 const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
-    ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
 });
+const auto NoActivation = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo()
+});
+const auto IgnoredQuantizationInfo = framework::dataset::make("IgnoredQuantizationInfo",
+{
+    QuantizationInfo()
+});
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(data_precommit,
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) }),
+                       QuantizedActivationFunctionsDataset,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::QASYMM8),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) }),
+                       QuantizedActivationFunctionsDataset,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(data_precommit_9x9,
                        framework::dataset::make("DataType",
-                                                DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
-                       QuantizedActivationFunctionsDataset),
+                                                DataType::QASYMM8),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
                        framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(data_precommit_9x9,
+FIXTURE_DATA_TEST_CASE(RunSmall9x9WithActivation, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(data_precommit_9x9,
                        framework::dataset::make("DataType",
-                                                DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) })),
-                       QuantizedActivationFunctionsDataset),
+                                                DataType::QASYMM8),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) }),
+                       QuantizedActivationFunctionsDataset,
                        framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(data_nightly, framework::dataset::make("DataType",
-                       DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
-                       QuantizedActivationFunctionsDataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(data_nightly, framework::dataset::make("DataType",
+                       DataType::QASYMM8),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
                        framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(data_nightly_9x9,
+FIXTURE_DATA_TEST_CASE(RunLargeWithActivation, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(data_nightly, framework::dataset::make("DataType",
+                       DataType::QASYMM8),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) }),
+                       QuantizedActivationFunctionsDataset,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate the target output against the reference within the QASYMM8 tolerance
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(data_nightly_9x9,
                        framework::dataset::make("DataType",
-                                                DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) })),
-                       QuantizedActivationFunctionsDataset),
+                                                DataType::QASYMM8),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
                        framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-
-TEST_SUITE_END() // QASYMM8
-
-TEST_SUITE(QASYMM8_CustomDataset)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::DirectConvolutionLayerDataset(),
-                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) })),
-                                       QuantizedActivationFunctionsDataset),
+FIXTURE_DATA_TEST_CASE(RunLarge9x9WithActivation, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(data_nightly_9x9,
+                       framework::dataset::make("DataType",
+                                                DataType::QASYMM8),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) }),
+                       QuantizedActivationFunctionsDataset,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(CustomDataset, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::DirectConvolutionLayerDataset(),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8),
+                                               IgnoredQuantizationInfo,
+                                       NoActivation,
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(CustomDatasetWithActivation, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::DirectConvolutionLayerDataset(),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) }),
+                                       QuantizedActivationFunctionsDataset,
                                framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() // QASYMM8_CustomDataset
+TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(data_precommit, framework::dataset::make("DataType",
-                                                                                                                        DataType::QASYMM8_SIGNED)),
-                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, -10) })),
-                                                                                                                        QuantizedActivationFunctionsDataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(data_precommit, framework::dataset::make("DataType",
+                                                                                                                        DataType::QASYMM8_SIGNED),
+                                                                                                                        IgnoredQuantizationInfo,
+                                                                                                                        NoActivation,
                                                                                                                         framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-
-FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(data_precommit_9x9,
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(data_precommit, framework::dataset::make("DataType",
+                                                                                                                        DataType::QASYMM8_SIGNED),
+                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, -10) }),
+                                                                                                                        QuantizedActivationFunctionsDataset,
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::ALL, combine(data_precommit,
+                       framework::dataset::make("DataType",
+                                                DataType::QASYMM8_SIGNED),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLDirectConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::ALL, combine(data_precommit,
+                       framework::dataset::make("DataType",
+                                                DataType::QASYMM8_SIGNED),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.1f, -10) }),
+                       QuantizedActivationFunctionsDataset,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(data_precommit_9x9,
                        framework::dataset::make("DataType",
-                                                DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
-                       QuantizedActivationFunctionsDataset),
+                                                DataType::QASYMM8_SIGNED),
+                       IgnoredQuantizationInfo,
+                       NoActivation,
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall9x9WithActivation, CLDirectConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(data_precommit_9x9,
+                       framework::dataset::make("DataType",
+                                                DataType::QASYMM8_SIGNED),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) }),
+                       QuantizedActivationFunctionsDataset,
                        framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
@@ -464,10 +803,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<int
 }
 
 FIXTURE_DATA_TEST_CASE(RunCustomDataset, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::DirectConvolutionLayerDataset(),
-                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) })),
-                                       QuantizedActivationFunctionsDataset),
+                       combine(datasets::DirectConvolutionLayerDataset(),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
+                                               IgnoredQuantizationInfo,
+                                       NoActivation,
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunCustomDatasetWithActivation, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::DirectConvolutionLayerDataset(),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) }),
+                                       QuantizedActivationFunctionsDataset,
                                framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
diff --git a/tests/validation/CL/ElementwiseMax.cpp b/tests/validation/CL/ElementwiseMax.cpp
index b9444b2795..bd47c23256 100644
--- a/tests/validation/CL/ElementwiseMax.cpp
+++ b/tests/validation/CL/ElementwiseMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@ const auto ElementwiseMaxQASYMM8SignedDataset = combine(combine(framework::datas
 const auto ElementwiseMaxQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
                                                   framework::dataset::make("DataType",
                                                                            DataType::QSYMM16));
-const auto ElementwiseMaxS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
+const auto ElementwiseMaxS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
                                               framework::dataset::make("DataType", DataType::S16));
 const auto ElementwiseMaxFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
                                                framework::dataset::make("DataType", DataType::F16));
@@ -71,6 +71,8 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(CL)
@@ -80,21 +82,18 @@ TEST_SUITE(ElementwiseMax)
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
                framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false})),
+               framework::dataset::make("Expected", { true, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwiseMax::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -107,7 +106,8 @@ using CLElementwiseMaxFixture = ElementwiseMaxValidationFixture<CLTensor, CLAcce
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMaxU8Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMaxU8Dataset),
+                                                                                                              OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -115,7 +115,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<uint8_t>, framework::Da
 TEST_SUITE_END()
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset),
+                                                                                                        OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -128,33 +129,36 @@ using CLElementwiseMaxQuantizedFixture = ElementwiseMaxValidationQuantizedFixtur
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMaxQASYMM8Dataset),
                                                                                                                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32, 0.01);
 }
 TEST_SUITE_END()
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                      framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                      framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END()
 TEST_SUITE(QSYMM16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMaxQSYMM16Dataset),
                                                                                                                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
                                                                                                                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -167,13 +171,16 @@ using CLElementwiseMaxFloatFixture = ElementwiseMaxValidationFloatFixture<CLTens
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset), EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset),
+                                                                                                                  EmptyActivationFunctionsDataset),
+                                                                                                          OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwiseMaxFP16Dataset),
-                                                                                                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwiseMaxFP16Dataset),
+                                                                                                                   ActivationFunctionsDataset),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
@@ -181,14 +188,16 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture<half>, fr
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset),
-                                                                                                           EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset),
+                                                                                                                   EmptyActivationFunctionsDataset),
+                                                                                                           OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwiseMaxFP32Dataset),
-                                                                                                                    ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwiseMaxFP32Dataset),
+                                                                                                                    ActivationFunctionsDataset),
+                                                                                                                    OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -197,16 +206,18 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture<float>, f
 template <typename T>
 using CLElementwiseMaxBroadcastFloatFixture = ElementwiseMaxBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwiseMaxBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwiseMaxBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ElementwiseMaxFP32Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwiseMaxBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwiseMaxBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ElementwiseMaxFP32Dataset),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/CL/ElementwiseMin.cpp b/tests/validation/CL/ElementwiseMin.cpp
index 8f53b241ab..ee229a0941 100644
--- a/tests/validation/CL/ElementwiseMin.cpp
+++ b/tests/validation/CL/ElementwiseMin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@ const auto ElementwiseMinQASYMM8SignedDataset = combine(combine(framework::datas
 const auto ElementwiseMinQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
                                                   framework::dataset::make("DataType",
                                                                            DataType::QSYMM16));
-const auto ElementwiseMinS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
+const auto ElementwiseMinS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
                                               framework::dataset::make("DataType", DataType::S16));
 const auto ElementwiseMinFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
                                                framework::dataset::make("DataType", DataType::F16));
@@ -71,6 +71,8 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(CL)
@@ -80,21 +82,18 @@ TEST_SUITE(ElementwiseMin)
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
                framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false})),
+               framework::dataset::make("Expected", { true, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwiseMin::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -107,7 +106,8 @@ using CLElementwiseMinFixture = ElementwiseMinValidationFixture<CLTensor, CLAcce
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMinU8Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMinU8Dataset),
+                                                                                                              OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -115,7 +115,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture<uint8_t>, framework::Da
 TEST_SUITE_END()
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinS16Dataset),
+                                                                                                        OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -128,33 +129,36 @@ using CLElementwiseMinQuantizedFixture = ElementwiseMinValidationQuantizedFixtur
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMinQASYMM8Dataset),
                                                                                                                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32, 0.01);
 }
 TEST_SUITE_END()
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMinQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                      framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                      framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END()
 TEST_SUITE(QSYMM16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMinQSYMM16Dataset),
                                                                                                                        framework::dataset::make("SrcQInfo0", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
                                                                                                                        framework::dataset::make("SrcQInfo1", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                                                                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -167,13 +171,16 @@ using CLElementwiseMinFloatFixture = ElementwiseMinValidationFloatFixture<CLTens
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset), EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset),
+                                                                                                                  EmptyActivationFunctionsDataset),
+                                                                                                          OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwiseMinFP16Dataset),
-                                                                                                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwiseMinFP16Dataset),
+                                                                                                                   ActivationFunctionsDataset),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
@@ -181,14 +188,16 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture<half>, fr
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset),
-                                                                                                           EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset),
+                                                                                                                   EmptyActivationFunctionsDataset),
+                                                                                                           OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwiseMinFP32Dataset),
-                                                                                                                    ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwiseMinFP32Dataset),
+                                                                                                                    ActivationFunctionsDataset),
+                                                                                                                    OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -196,16 +205,18 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture<float>, f
 template <typename T>
 using CLElementwiseMinBroadcastFloatFixture = ElementwiseMinBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwiseMinBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwiseMinBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ElementwiseMinFP32Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwiseMinBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwiseMinBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ElementwiseMinFP32Dataset),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/CL/ElementwisePower.cpp b/tests/validation/CL/ElementwisePower.cpp
index a2d3ba6c09..c2aeb6e045 100644
--- a/tests/validation/CL/ElementwisePower.cpp
+++ b/tests/validation/CL/ElementwisePower.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,8 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(CL)
@@ -96,29 +98,33 @@ using CLElementwisePowerBroadcastFloatFixture = ElementwisePowerBroadcastValidat
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwisePowerFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset),
-                                                                                                            EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwisePowerFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset),
+                                                                                                                    EmptyActivationFunctionsDataset),
+                                                                                                            OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwisePowerFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwisePowerFP16Dataset),
-                                                                                                                     ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwisePowerFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwisePowerFP16Dataset),
+                                                                                                                     ActivationFunctionsDataset),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwisePowerBroadcastFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwisePowerBroadcastFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ElementwisePowerFP16Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwisePowerBroadcastFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwisePowerBroadcastFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ElementwisePowerFP16Dataset),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
@@ -126,29 +132,33 @@ FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwisePowerBroadcastFl
 TEST_SUITE_END() //FP16
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwisePowerFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset),
-                                                                                                             EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwisePowerFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset),
+                                                                                                                     EmptyActivationFunctionsDataset),
+                                                                                                             OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwisePowerFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwisePowerFP32Dataset),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwisePowerFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwisePowerFP32Dataset),
+                                                                                                                      ActivationFunctionsDataset),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwisePowerBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwisePowerBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ElementwisePowerFP32Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwisePowerBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwisePowerBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ElementwisePowerFP32Dataset),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/CL/ElementwiseSquaredDiff.cpp b/tests/validation/CL/ElementwiseSquaredDiff.cpp
index 0a4ab6627b..ee0279df33 100644
--- a/tests/validation/CL/ElementwiseSquaredDiff.cpp
+++ b/tests/validation/CL/ElementwiseSquaredDiff.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@ const auto ElementwiseSquaredDiffQASYMM8Dataset = combine(combine(framework::dat
 const auto ElementwiseSquaredDiffQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
                                                           framework::dataset::make("DataType",
                                                                                    DataType::QSYMM16));
-const auto ElementwiseSquaredDiffS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
+const auto ElementwiseSquaredDiffS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
                                                       framework::dataset::make("DataType", DataType::S16));
 const auto ElementwiseSquaredDiffFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
                                                        framework::dataset::make("DataType", DataType::F16));
@@ -70,6 +70,8 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(CL)
@@ -79,21 +81,18 @@ TEST_SUITE(ElementwiseSquaredDiff)
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
                framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false})),
+               framework::dataset::make("Expected", { true, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwiseSquaredDiff::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -106,7 +105,8 @@ using CLElementwiseSquaredDiffFixture = ElementwiseSquaredDiffValidationFixture<
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseSquaredDiffU8Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffU8Dataset),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -114,7 +114,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<uint8_t>, frame
 TEST_SUITE_END()
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -127,22 +128,24 @@ using CLElementwiseSquaredDiffQuantizedFixture = ElementwiseSquaredDiffValidatio
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8Dataset),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32, 0.01);
 }
 TEST_SUITE_END()
 TEST_SUITE(QSYMM16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQSYMM16Dataset),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qsymm16);
@@ -155,14 +158,16 @@ using CLElementwiseSquaredDiffFloatFixture = ElementwiseSquaredDiffValidationFlo
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset),
-                                                                                                                  EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset),
+                                                                                                                  EmptyActivationFunctionsDataset),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwiseSquaredDiffFP16Dataset),
-                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwiseSquaredDiffFP16Dataset),
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, 0.01);
@@ -170,14 +175,16 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<h
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset),
-                                                                                                                   EmptyActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset),
+                                                                                                                   EmptyActivationFunctionsDataset),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), ElementwiseSquaredDiffFP32Dataset),
-                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ElementwiseSquaredDiffFP32Dataset),
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -185,16 +192,18 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<f
 template <typename T>
 using CLElementwiseSquaredDiffBroadcastFloatFixture = ElementwiseSquaredDiffBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLElementwiseSquaredDiff, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwiseSquaredDiffBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLElementwiseSquaredDiffBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ElementwiseSquaredDiffFP32Dataset),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwiseSquaredDiffBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLElementwiseSquaredDiffBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ElementwiseSquaredDiffFP32Dataset),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/CL/ExpLayer.cpp b/tests/validation/CL/ExpLayer.cpp
index 16e75a64b4..1797046e5d 100644
--- a/tests/validation/CL/ExpLayer.cpp
+++ b/tests/validation/CL/ExpLayer.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/FFT.cpp b/tests/validation/CL/FFT.cpp
index fb2f1f53e2..99a83abe5c 100644
--- a/tests/validation/CL/FFT.cpp
+++ b/tests/validation/CL/FFT.cpp
@@ -175,6 +175,8 @@ TEST_SUITE(FFTConvolutionLayer)
 
 template <typename T>
 using CLFFTConvolutionLayerFixture = FFTConvolutionValidationFixture<CLTensor, CLAccessor, CLFFTConvolutionLayer, T>;
+template <typename T>
+using CLFFTConvolutionLayerMixedDataLayoutFixture = FFTConvolutionValidationFixture<CLTensor, CLAccessor, CLFFTConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -186,6 +188,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLFFTConvolutionLayerFixture<float>, framework:
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, tolerance_num_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFFTConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFFTConvolutionLayerDataset(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                 ActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, tolerance_num_f32);
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLFFTConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFFTConvolutionLayerDataset(),
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index 78195a556b..2f0c86499b 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,7 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -51,24 +52,20 @@ constexpr float                     tolerance_num = 0.07f;      /**< Tolerance n
 /** Tolerance for quantized asymmetric operations */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 
-/** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
-{
-    DataType::F16,
-    DataType::F32,
-    DataType::QASYMM8,
-    DataType::QASYMM8_SIGNED,
-});
-
-const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true }));
+const auto FullyConnectedParameters = combine(make("TransposeWeights", { false, true }), make("ReshapeWeights", { false, true }));
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(1.f / 255.f, 10),
     QuantizationInfo(1.1f, 10),
 });
 
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo",
+{
+    QuantizationInfo(),
+});
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
@@ -77,13 +74,16 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH)
 });
 
-const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("ActivationInfo",
+// Dataset holding only the default-constructed ActivationLayerInfo; this case only runs with dynamic quantization
+const auto NoActivationFunctionsQuantizedDataset = make("ActivationInfo",
 {
-    ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f)
+    ActivationLayerInfo()
 });
+
+const auto ActivationFunctionsQuantizedDataset = concat(concat(
+                                                        make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                                                        make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f))),
+                                                        make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f)));
 } // namespace
 
 TEST_SUITE(CL)
@@ -92,33 +92,33 @@ TEST_SUITE(FullyConnectedLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
+    make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Invalid weights dimensions
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Wrongly reshaped weights
                                           }),
-    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
+    make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 231U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
+    make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+    make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                            })),
-    framework::dataset::make("TransposeWeights",{ true, true, false, true, true })),
-    framework::dataset::make("ReshapedWeights",{ false, false, false, false, false})),
-    framework::dataset::make("Expected", { false, true, true, false, false })),
+    make("TransposeWeights",{ true, true, false, true, true })),
+    make("ReshapedWeights",{ false, false, false, false, false})),
+    make("Expected", { false, true, true, false, false })),
     input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected)
 {
     // Create Fully Connected layer info
@@ -138,37 +138,73 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
 
 template <typename T>
 using CLFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T>;
+template <typename T>
+using CLFullyConnectedLayerMixedDataLayoutFixture = FullyConnectedLayerValidationFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T, true>;
+template <typename T>
+using CLFullyConnectedLayerDynamicWeightsFixture = FullyConnectedWithDynamicWeightsFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T>;
+template <typename T>
+using CLFullyConnectedNoBiasFixture = FullyConnectedDynamicNoBiasFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
-                                                                                                                        FullyConnectedParameters),
-                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                                                                                                                        FullyConnectedParameters,
+                                                                                                                        make("DataType", DataType::F16),
                                                                                                                 ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(),
-                                                                                                                      FullyConnectedParameters),
-                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(),
+                                                                                                                      FullyConnectedParameters,
+                                                                                                                      make("DataType", DataType::F16),
                                                                                                               ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F16),
+                       make("ActivationInfo", ActivationLayerInfo()),
+                       make("WeightsReshaped", { false, true })))
+{ // NOTE(review): body intentionally empty - presumably the fixture performs the validation itself; confirm
+}
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                 make("DataType", DataType::F32),
                                                                                                                  ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(
+                           make("Input", TensorShape(9U, 5U, 7U)),
+                           make("Weights", TensorShape(315U, 271U)),
+                       make("Biases", TensorShape(271U)),
+                       make("Output", TensorShape(271U)),
+                       FullyConnectedParameters,
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo()),
+                       make("WeightsReshaped", { false, true })))
+{ // NOTE(review): body intentionally empty - presumably the fixture performs the validation itself; confirm
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicNoBias, CLFullyConnectedNoBiasFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", { ActivationLayerInfo(), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }),
+                       make("WeightsReshaped", { false })))
+{ // NOTE(review): body intentionally empty - presumably the fixture performs the validation itself; confirm
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                make("DataType", DataType::F32),
                                                                                                                ActivationFunctionsDataset))
 {
     // Validate output
@@ -179,37 +215,141 @@ TEST_SUITE_END()
 
 template <typename T>
 using CLFullyConnectedLayerQuantizedFixture = FullyConnectedLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T>;
+template <typename T>
+using CLFullyConnectedLayerQuantizedMixedDataLayoutFixture = FullyConnectedLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T, true>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData),
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), QuantizationData,
                                ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData),
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeWithActivation, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                            combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), QuantizationData,
                                ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
+
+// Dynamic Quantization tests below
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), IgnoredQuantizationData,
+                        NoActivationFunctionsQuantizedDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                        combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8), IgnoredQuantizationData,
+                               NoActivationFunctionsQuantizedDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                        NoActivationFunctionsQuantizedDataset,
+                       make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{ // NOTE(review): body intentionally empty - presumably the fixture performs the validation itself; confirm
+}
+
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                    combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       IgnoredQuantizationData,
+                               NoActivationFunctionsQuantizedDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE_END() /* QASYMM8 */
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), QuantizationData),
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, CLFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8_SIGNED), QuantizationData,
                                ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() /* QASYMM8_SIGNED */
-TEST_SUITE_END() /* Quantized */
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                            combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+// Dynamic Quantization tests below
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                            combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters, make("DataType", DataType::QASYMM8_SIGNED), IgnoredQuantizationData,
+                               NoActivationFunctionsQuantizedDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                    combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       IgnoredQuantizationData,
+                               NoActivationFunctionsQuantizedDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, CLFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       make("ActivationInfo", ActivationLayerInfo()),
+                       make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{ // NOTE(review): body intentionally empty - presumably the fixture performs the validation itself; confirm
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicNoBias, CLFullyConnectedNoBiasFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       make("ActivationInfo", ActivationLayerInfo()),
+                       make("WeightsReshaped", { false /* COMPMID-6000: Support FullyConnected with quantized dynamic weights already reshaped */ })))
+{ // NOTE(review): body intentionally empty - presumably the fixture performs the validation itself; confirm
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // FullyConnectedLayer
+TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/GEMM.cpp b/tests/validation/CL/GEMM.cpp
index 838920c29d..16ca14f1d6 100644
--- a/tests/validation/CL/GEMM.cpp
+++ b/tests/validation/CL/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,29 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 TEST_SUITE(CL)
 TEST_SUITE(GEMM)
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // All-F32 case: expected to pass validation
+                                                     }),
+               framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
+                                                     })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
+                                                     })),
+               framework::dataset::make("Expected", { false, true })),
+               lhs_info, rhs_info, output_info, expected)
+{
+    constexpr float alpha = 1.0;
+    constexpr float beta = 0.0;
+    const auto gemm_info = GEMMInfo();
+    bool is_valid = bool(CLGEMM::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), alpha, beta, gemm_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); // Validation outcome must equal the per-case Expected flag
+}
+// clang-format on
+// *INDENT-ON*
 template <typename T>
 using CLGEMMFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM, T>;
 
@@ -71,6 +94,9 @@ using CLGEMMOutput3DFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM
 template <typename T>
 using CLGEMMInputOutput3DFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM, T, false, true, true>;
 
+template <typename T>
+using CLBatchedMatMulFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM, T, true, false, false, false, false, true>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
@@ -181,10 +207,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMOutput3DFixture<half>, framework::Dataset
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END() // FP16
-
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // OUTPUT_3D
 
+TEST_SUITE(BATCHED_MATMUL)
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchedMatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                   framework::dataset::make("ReshapeWeights", { false })),
+                                                                                                           framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                  framework::dataset::make("ReshapeWeights", { false })),
+                                                                                                          framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // BATCHED_MATMUL
+
 TEST_SUITE_END() // GEMM
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 5a1971b54c..78d794a9bb 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,6 +44,9 @@ namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
+
 namespace
 {
 constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
@@ -53,6 +56,7 @@ TEST_SUITE(GEMMLowp)
 
 TEST_SUITE(MatrixMultiplyCore)
 using CLGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>;
+using CLGEMMLowpBatchedMatMulFixture      = GEMMLowpMatrixMultiplyCoreValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, true>;
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
@@ -66,18 +70,74 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFixture, framework:
     validate(CLAccessor(_target), _reference);
 }
 
+using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;
+TEST_SUITE(BatchedMatMul)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8
+
+using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8_SIGNED }),
+        make("reshape_b_only_on_first_run", { false })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // BatchedMatMul
+
 TEST_SUITE(FusedOffsetOutput)
 TEST_SUITE(QASYMM8)
 using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>;
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
+TEST_SUITE(Output3D)
+using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture =
+    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { true, false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // Output3D
+
+TEST_SUITE(InputOutput3D)
+using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture =
+    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, true, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { true, false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // InputOutput3D
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
@@ -86,8 +146,10 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
 using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture =
     GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(),
+        make("DataType", { DataType::QASYMM8_SIGNED }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_quant);
@@ -130,24 +192,24 @@ TEST_SUITE(QuantizeDownInt32Scale)
 
 TEST_SUITE(QASYMM8)
 
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                      3)
-                                                      * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_uint8_scale_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) * make("result_shift", 2, 3)
+                                                      * make("min", 0) * make("max", 255) * make("addBias", { false, true });
 
-const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                           2)
-                                                           * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 173) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_uint8_scale_relu_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2)
+                                                           * make("result_shift", 2, 3) * make("min", 0, 2) * make("max", 171, 173) * make("addBias", { false, true });
 
 using CLGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage>;
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
 TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -158,24 +220,24 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 
-const auto quantize_down_int32_to_int8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                     3)
-                                                     * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_int8_scale_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) * make("result_shift", 2, 3)
+                                                     * make("min", -128) * make("max", 127) * make("addBias", { false, true });
 
-const auto quantize_down_int32_to_int8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                          2)
-                                                          * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", -100, -98) * framework::dataset::make("max", 71, 73) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_int8_scale_relu_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2)
+                                                          * make("result_shift", 2, 3) * make("min", -100, -98) * make("max", 71, 73) * make("addBias", { false, true });
 
 using CLGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage>;
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
 TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -185,140 +247,6 @@ TEST_SUITE_END() // BoundedReLu
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // QuantizeDownInt32Scale
 
-TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint)
-
-TEST_SUITE(QASYMM8)
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-using CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QASYMM8
-TEST_SUITE(QASYMM8_SIGNED)
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
-                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
-                                                                        * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128, -126) * framework::dataset::make("max", 110, 112) * framework::dataset::make("addBias", { false, true });
-using CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QASYMM8_SIGNED
-TEST_SUITE(QSYMM16)
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
-                                                                                                        1073741825)
-                                                                               * framework::dataset::make("result_shift", -3,
-                                                                                                          -2)
-                                                                               * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
-                                                                                                             254601602)
-                                                                                    * framework::dataset::make("result_shift", -3,
-                                                                                                               -1)
-                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-TEST_SUITE(NoRelu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // NoRelu
-TEST_SUITE(BoundedReLu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QSYMM16
-TEST_SUITE_END() // QuantizeDownInt32ScaleByFixedPoint
-
 TEST_SUITE(QuantizeDownInt32ScaleByFloat)
 
 TEST_SUITE(QASYMM8)
@@ -326,13 +254,14 @@ using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture =
     GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage, uint8_t>;
 
 FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture, framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8),
-                                                                       datasets::TinyShapes()),
-                                                               framework::dataset::make("result_real_multiplier", 0.33f)),
-                                                       framework::dataset::make("result_offset", 2, 3)),
-                                               framework::dataset::make("min", 0)),
-                                       framework::dataset::make("max", 255)),
-                               framework::dataset::make("addBias", { false, true })))
+    combine(
+        make("DataType", DataType::QASYMM8),
+        datasets::TinyShapes(),
+        make("result_real_multiplier", 0.33f),
+        make("result_offset", 2, 3),
+        make("min", 0),
+        make("max", 255),
+        make("addBias", { false, true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -343,13 +272,14 @@ TEST_SUITE(QASYMM8_SIGNED)
 using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed =
     GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage, int8_t>;
 FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed, framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
-                                                                       datasets::TinyShapes()),
-                                                               framework::dataset::make("result_real_multiplier", 0.33f)),
-                                                       framework::dataset::make("result_offset", 2, 3)),
-                                               framework::dataset::make("min", -128)),
-                                       framework::dataset::make("max", 127)),
-                               framework::dataset::make("addBias", { false, true })))
+    combine(
+        make("DataType", DataType::QASYMM8_SIGNED),
+        datasets::TinyShapes(),
+        make("result_real_multiplier", 0.33f),
+        make("result_offset", 2, 3),
+        make("min", -128),
+        make("max", 127),
+        make("addBias", { false, true })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -363,4 +293,4 @@ TEST_SUITE_END() // GEMMLowp
 TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
index 1057af95f2..d0d06a8ddb 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
@@ -41,7 +41,7 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 // Create function for CLGEMMMatrixMultiplyNativeKernel
-using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyNativeKernel>;
+using CLGEMMLowpMatrixMultiplyNative = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyNativeKernel>;
 
 // Fixture for CLGEMMLowpMatrixMultiplyNative
 using CLGEMMLowpMatrixMultiplyNativeFixture = GEMMLowpMatrixMultiplyNativeValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyNative>;
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
index 4873a291ab..88455bdeb8 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
@@ -42,14 +42,14 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-// Create function for CLGEMMReshapeLHSMatrixKernel
-using CLGEMMReshapeLHSMatrix = CLSynthetizeFunction<CLGEMMReshapeLHSMatrixKernel>;
+// Create function for ClGemmReshapeLhsMatrixKernel
+using CLGEMMReshapeLHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeLhsMatrixKernel>;
 
-// Create function for CLGEMMReshapeRHSMatrixKernel
-using CLGEMMReshapeRHSMatrix = CLSynthetizeFunction<CLGEMMReshapeRHSMatrixKernel>;
+// Create function for ClGemmReshapeRhsMatrixKernel
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>;
 
-// Create function for CLGEMMMatrixMultiplyReshapedKernel
-using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>;
+// Create function for ClGemmLowpMatrixMultiplyReshapedKernel
+using CLGEMMLowpMatrixMultiplyReshaped = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedKernel>;
 
 // Fixture for CLGEMMLowpMatrixMultiplyReshaped
 using CLGEMMLowpMatrixMultiplyReshapedFixture = GEMMLowpMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshaped>;
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
index fa256280ca..c56901effc 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,8 +25,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -46,10 +46,10 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 // Create function for CLGEMMReshapeRHSMatrixKernel
-using CLGEMMReshapeRHSMatrix = CLSynthetizeFunction<CLGEMMReshapeRHSMatrixKernel>;
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>;
 
 // Create function for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>;
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>;
 
 // Fixture for CLGEMMLowpMatrixMultiplyReshapedOnlyRHS
 using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFixture = GEMMLowpMatrixMultiplyReshapedOnlyRHSValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS>;
@@ -157,7 +157,7 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
 
     // Create and configure function
     CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm;
-    gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info);
+    gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info);
 }
 } // namespace
 
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL.cpp
new file mode 100644
index 0000000000..a0d13c3e39
--- /dev/null
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/Helper.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/fixtures/GEMMLowpFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::opencl::kernels;
+
+// Create function for ClGemmReshapeRhsMatrixKernel
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>;
+
+// Create function for ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>;
+
+// Fixture for CLGEMMLowpMatrixMultiplyReshapedOnlyRHS
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULFixture =
+    GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS>;
+
+// Fixtures for CLGEMMLowpMatrixMultiplyReshapedOnlyRHS with output stage (int8_t and uint8_t variants)
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureSigned =
+    GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageValidationFixture<int8_t, CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS, CLReductionOperation, CLCast>;
+
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureUnsigned =
+    GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageValidationFixture<uint8_t, CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS, CLReductionOperation, CLCast>;
+
+namespace
+{
+// *INDENT-OFF*
+// clang-format off
+
+/** M values to test */
+const auto m_values = framework::dataset::make("M", {16, 49});
+
+/** N values to test */
+const auto n_values = framework::dataset::make("N", {16, 259});
+
+/** K values to test */
+const auto k_values = framework::dataset::make("K", {192});
+
+/** Batch size values to test */
+const auto b_values = framework::dataset::make("batch_size", {1, 2});
+
+/** M0 values to test - Precommit */
+const auto m0 = framework::dataset::make("M0", {1, 2, 4});
+
+/** N0 values to test - Precommit */
+const auto n0 = framework::dataset::make("N0", { 1, 4, 8});
+
+/** K0 values to test - Precommit */
+const auto k0 = framework::dataset::make("K0", { 4 });
+
+/** H0 values to test - Precommit */
+const auto h0 = framework::dataset::make("H0", 1);
+
+/** Interleave values to test with RHS matrix */
+const auto i_values_rhs = framework::dataset::make("interleave_rhs", { false });
+
+/** Transpose values to test with RHS matrix */
+const auto t_values_rhs = framework::dataset::make("transpose_rhs", { true });
+
+const auto broadcast_bias = framework::dataset::make("broadcast_bias", {true, false});
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL)
+FIXTURE_DATA_TEST_CASE(Signed, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULFixture, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                    framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE(Unsigned, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULFixture, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                    framework::dataset::make("DataType", { DataType::QASYMM8})))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE(OutputStageSigned, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureSigned, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   broadcast_bias),
+                    framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED})))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE(OutputStageUnsigned, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureUnsigned, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   broadcast_bias),
+                    framework::dataset::make("DataType", { DataType::QASYMM8})))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/tests/validation/CL/GEMMMatrixMultiply.cpp b/tests/validation/CL/GEMMMatrixMultiply.cpp
deleted file mode 100644
index fdf7f503ec..0000000000
--- a/tests/validation/CL/GEMMMatrixMultiply.cpp
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/Helper.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GEMMFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-using namespace arm_compute::misc::shape_calculator;
-
-// Create function for CLGEMMMatrixMultiplyKernel
-using CLGEMMMatrixMultiplyNative = CLSynthetizeFunction<CLGEMMMatrixMultiplyKernel>;
-
-// Fixture for GEMMMatrixMultiplyValidationFixture
-template <typename T>
-using CLGEMMMatrixMultiplyNativeFixture = GEMMMatrixMultiplyValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
-
-// Fixture for GEMMMatrixMultiply3DValidationFixture
-template <typename T>
-using CLGEMMMatrixMultiplyNative3DFixture = GEMMMatrixMultiply3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
-
-namespace
-{
-// *INDENT-OFF*
-// clang-format off
-RelativeTolerance<float> rel_tolerance_f32(0.001f);
-constexpr float          abs_tolerance_f32(0.0001f);
-
-RelativeTolerance<half> rel_tolerance_f16(half(0.2));
-constexpr float         tolerance_num_f16 = 0.02f;
-
-/** Alpha values to test */
-const auto alpha_values = framework::dataset::make("alpha", {1.0f, -0.75f} );
-
-/** Beta values to test */
-const auto beta_values = framework::dataset::make("beta", {-0.35f, 0.0f} );
-
-/** M, N combinations to test
- *  1: Special 1x1 case
- *  2: Special multples of processor size in both dimensions
- *  3: Non multiples of processor size in both dimensions
- *  4: Special 1x1003 case
-*/
-const auto m_n_values = zip(
-    framework::dataset::make("M", {1, 16, 37, 1}),
-    framework::dataset::make("N", {1, 16, 51, 1003})
-    );
-
-/** N values to test */
-const auto n_values = framework::dataset::make("N", {51, 1003});
-
-/** K values to test */
-const auto k_values = framework::dataset::make("K", 23);
-
-/** M_W values to test */
-const auto m_w_values = framework::dataset::make("M_W", 5);
-
-/** M_H values to test */
-const auto m_h_values = framework::dataset::make("M_H", 7);
-
-/** Batch size values to test */
-const auto b_values = framework::dataset::make("batch_size", 1, 3);
-
-/** Activation values to test */
-const auto act_values = framework::dataset::make("Activation",
-{
-    ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
-});
-
-/** Broadcast bias from vector to matrix */
-const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { false, true } );
-
-/** GPU architectures values to test */
-const auto gpu_arch_values = framework::dataset::make("GPUArch",
-{
-    GPUTarget::MIDGARD,
-    GPUTarget::BIFROST
-});
-
-/** Data types values to test in the configuration */
-const auto data_type_values = framework::dataset::make("DataType",
-{
-    DataType::F32,
-    DataType::F16
-});
-
-/** M values to test */
-const auto fp16_mixed_precision_values = framework::dataset::make("fp16_mixed_precision", {true, false});
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(GEMMMatrixMultiply)
-TEST_CASE(Negative, framework::DatasetMode::ALL)
-{
-    // Unsupported QASYMM8 data type
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U), 1, DataType::QASYMM8);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::QASYMM8);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::QASYMM8);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, nullptr, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Unsupported SIZE_T data type
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U), 1, DataType::SIZET);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::SIZET);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::SIZET);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, nullptr, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Mixed precision with F32
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info  = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = true;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, nullptr, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Max number of dimensions LHS matrix
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U, 4U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, nullptr, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Max number of dimensions RHS matrix
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 4U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 4U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 4U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, nullptr, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Broadcast bias
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U), 1, DataType::F16);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::F16);
-        // The correct shape should be bias = TensorInfo(TensorShape(14U, 1U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F16);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F16);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, true);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Invalid dimensions for the bias
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::F32);
-        // The correct shape should be bias = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(14U, 8U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Invalid dimensions for the output
-    {
-        const auto lhs                       = TensorInfo(TensorShape(13U, 12U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(14U, 13U, 1U, 1U), 1, DataType::F32);
-        // The correct shape should be out = TensorInfo(TensorShape(14U, 12U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(14U, 7U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = false;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(12, 14, 13, 1, 1, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, nullptr, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-}
-
-TEST_SUITE(Float)
-TEST_SUITE(FP32)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_n_values,
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   broadcast_bias_values),
-                                                                   framework::dataset::make("fp16_mixed_precision", false)),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyNative3DFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_w_values,
-                                                                   m_h_values),
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   broadcast_bias_values),
-                                                                   framework::dataset::make("fp16_mixed_precision", false)),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-TEST_SUITE_END() // FP32
-
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_n_values,
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   broadcast_bias_values),
-                                                                   fp16_mixed_precision_values),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyNative3DFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_w_values,
-                                                                   m_h_values),
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   broadcast_bias_values),
-                                                                   fp16_mixed_precision_values),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16);
-}
-
-TEST_SUITE_END() // FP16
-TEST_SUITE_END() // Float
-TEST_SUITE_END() // GEMMMatrixMuliplty
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp b/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
deleted file mode 100644
index d6507a06c4..0000000000
--- a/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/CL/Helper.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GEMMFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-using namespace arm_compute::misc::shape_calculator;
-
-// Create function for CLGEMMReshapeLHSMatrixKernel
-using CLGEMMReshapeLHSMatrix = CLSynthetizeFunction<CLGEMMReshapeLHSMatrixKernel>;
-
-// Create function for CLGEMMReshapeRHSMatrixKernel
-using CLGEMMReshapeRHSMatrix = CLSynthetizeFunction<CLGEMMReshapeRHSMatrixKernel>;
-
-// Create function for CLGEMMMatrixMultiplyKernel
-using CLGEMMMatrixMultiplyReshaped = CLSynthetizeFunction<CLGEMMMatrixMultiplyKernel>;
-
-// Fixture for GEMMMatrixMultiplyInterleavedTransposedValidationFixture
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedFixture =
-    GEMMMatrixMultiplyInterleavedTransposedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
-
-// Fixture for GEMMMatrixMultiplyInterleavedTransposed3DValidationFixture
-template <typename T>
-using CLGEMMMatrixMultiplyReshaped3DFixture =
-    GEMMMatrixMultiplyInterleavedTransposed3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
-
-namespace
-{
-// *INDENT-OFF*
-// clang-format off
-RelativeTolerance<float> rel_tolerance_f32(0.001f);
-constexpr float          abs_tolerance_f32(0.0001f);
-
-RelativeTolerance<half> rel_tolerance_f16(half(0.2));
-constexpr float         tolerance_num_f16 = 0.02f;
-
-/** Alpha values to test */
-const auto alpha_values = framework::dataset::make("alpha", {1.0f, -0.75f} );
-
-/** Beta values to test */
-const auto beta_values = framework::dataset::make("beta", {-0.35f, 0.0f} );
-
-/** M, N combinations to test
- *  1: Special 1x1 case
- *  2: Special multples of processor size in both dimensions
- *  3: Non multiples of processor size in both dimensions
-*/
-const auto m_n_values = zip(
-    framework::dataset::make("M", {1, 16, 37}),
-    framework::dataset::make("N", {1, 16, 51})
-    );
-
-/** N values to test */
-const auto n_values = framework::dataset::make("N", 51);
-
-/** K values to test */
-const auto k_values = framework::dataset::make("K", 23);
-
-/** M_W values to test */
-const auto m_w_values = framework::dataset::make("M_W", 5);
-
-/** M_H values to test */
-const auto m_h_values = framework::dataset::make("M_H", 7);
-
-/** Batch size values to test */
-const auto b_values = framework::dataset::make("batch_size", 1, 3);
-
-/** Activation values to test */
-const auto act_values = framework::dataset::make("Activation",
-{
-    ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
-});
-
-/** V0 values to test */
-const auto v0_values = framework::dataset::make("V0", 2);
-
-/** H0 values to test */
-const auto h0_values = framework::dataset::make("H0", 4);
-
-/** Broadcast bias from vector to matrix */
-const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", {false, true} );
-
-/** GPU architectures values to test */
-const auto gpu_arch_values = framework::dataset::make("GPUArch",
-{
-    GPUTarget::MIDGARD,
-    GPUTarget::BIFROST
-});
-
-/** Data types values to test in the configuration */
-const auto data_type_values = framework::dataset::make("DataType",
-{
-    DataType::F32,
-    DataType::F16
-});
-
-/** M values to test */
-const auto fp16_mixed_precision_values = framework::dataset::make("fp16_mixed_precision", {true, false});
-} // namespace
-
-TEST_SUITE(CL)
-TEST_SUITE(GEMMMatrixMultiplyInterleavedTransposed)
-TEST_CASE(Negative, framework::DatasetMode::ALL)
-{
-    // The following tests are already integrated in the GEMMMatrixMultiply validation because
-    // in common with this validation
-    // - Unsupported QASYMM8 data type
-    // - Unsupported SIZE_T data type
-    // - Mixed precision with F32
-    // - Max number of dimensions LHS matrix
-    // - Max number of dimensions RHS matrix
-
-    // Invalid LHS dimensions
-    {
-        // The correct shape should be: lhs = TensorInfo(TensorShape(256U, 1U, 1U, 1U), 1, DataType::F32);
-        const auto lhs                       = TensorInfo(TensorShape(256U, 2U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(104U, 3U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = true;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(16, 24, 13, 2, 4, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Invalid RHS dimensions
-    {
-        const auto lhs                       = TensorInfo(TensorShape(256U, 1U, 1U, 1U), 1, DataType::F32);
-        // The correct shape should be rhs = TensorInfo(TensorShape(104U, 3U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(104U, 4U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = true;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(16, 24, 13, 2, 4, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Broadcast bias
-    {
-        const auto lhs                       = TensorInfo(TensorShape(256U, 1U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(104U, 3U, 1U, 1U), 1, DataType::F32);
-        // The correct shape should be bias = TensorInfo(TensorShape(24U, 1U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = true;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(16, 24, 13, 2, 4, 0, false, true);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Invalid dimensions for the bias
-    {
-        const auto lhs                       = TensorInfo(TensorShape(256U, 1U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(104U, 3U, 1U, 1U), 1, DataType::F32);
-        // The correct shape should be bias = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(25U, 16U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = true;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(16, 24, 13, 2, 4, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-
-    // Invalid dimensions for the output
-    {
-        const auto lhs                       = TensorInfo(TensorShape(256U, 1U, 1U, 1U), 1, DataType::F32);
-        const auto rhs                       = TensorInfo(TensorShape(104U, 3U, 1U, 1U), 1, DataType::F32);
-        const auto bias                      = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        // The correct shape should be out = TensorInfo(TensorShape(24U, 16U, 1U, 1U), 1, DataType::F32);
-        const auto out                       = TensorInfo(TensorShape(24U, 13U, 1U, 1U), 1, DataType::F32);
-        constexpr float alpha                = 1.3f;
-        constexpr float beta                 = 0.7f;
-        const bool is_interleaved_transposed = true;
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(16, 24, 13, 2, 4, 0, false, false);
-        const GPUTarget gpu_target           = GPUTarget::MIDGARD;
-        const bool fp_mixed_precision        = false;
-        const auto status    = CLGEMMMatrixMultiplyKernel::validate(&lhs, &rhs, &bias, &out, alpha, beta, is_interleaved_transposed, reshape_info, gpu_target, fp_mixed_precision);
-        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
-    }
-}
-
-TEST_SUITE(Float)
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_n_values,
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   v0_values),
-                                                                   h0_values),
-                                                                   broadcast_bias_values),
-                                                                   framework::dataset::make("fp16_mixed_precision", false)),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_w_values,
-                                                                   m_h_values),
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   v0_values),
-                                                                   h0_values),
-                                                                   broadcast_bias_values),
-                                                                   framework::dataset::make("fp16_mixed_precision", false)),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-TEST_SUITE_END() // FP32
-
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_n_values,
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   v0_values),
-                                                                   h0_values),
-                                                                   broadcast_bias_values),
-                                                                   fp16_mixed_precision_values),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_w_values,
-                                                                   m_h_values),
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   alpha_values),
-                                                                   beta_values),
-                                                                   v0_values),
-                                                                   h0_values),
-                                                                   broadcast_bias_values),
-                                                                   fp16_mixed_precision_values),
-                                                                   act_values),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   gpu_arch_values))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16);
-}
-
-TEST_SUITE_END() // FP16
-TEST_SUITE_END() // Float
-TEST_SUITE_END() // GEMMMatrixMulipltyInterleavedTransposed
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index ec6b87fbae..0ddf43766f 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -44,9 +44,10 @@ namespace test
 namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::opencl::kernels;
 
-// Create function for CLGEMMMatrixMultiplyNativeKernel
-using CLGEMMMatrixMultiplyNative = CLSynthetizeFunction<CLGEMMMatrixMultiplyNativeKernel>;
+// Create function for ClGemmMatrixMultiplyNativeKernel
+using CLGEMMMatrixMultiplyNative = CLSynthetizeOperator<ClGemmMatrixMultiplyNativeKernel>;
 
 // Fixture for CLGEMMMatrixMultiplyNative
 template <typename T>
@@ -90,8 +91,8 @@ const auto b_values = framework::dataset::make("batch_size", 1, 3);
 /** Activation values to test */
 const auto act_values = framework::dataset::make("Activation",
 {
-    ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
 });
 
 /** M0 values to test - Precommit */
@@ -184,7 +185,7 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
 
     // Create and configure function
     CLGEMMMatrixMultiplyNative gemm;
-    gemm.configure(&lhs, &rhs, &bias, &dst, 1.0f, 1.0f, lhs_info, rhs_info, kernel_info);
+    gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), 1.0f, 1.0f, lhs_info, rhs_info, kernel_info);
 }
 } // namespace
 
@@ -322,6 +323,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyNative3DFixture<float>, f
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // GEMMMatrixMulipltyNative
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index 52afb716e4..b06e4bf213 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,9 +26,9 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -46,15 +46,16 @@ namespace test
 namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::opencl::kernels;
 
-// Create function for CLGEMMReshapeLHSMatrixKernel
-using CLGEMMReshapeLHSMatrix = CLSynthetizeFunction<CLGEMMReshapeLHSMatrixKernel>;
+// Create function for ClGemmReshapeLhsMatrixKernel
+using CLGEMMReshapeLHSMatrix = CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>;
 
-// Create function for CLGEMMReshapeRHSMatrixKernel
-using CLGEMMReshapeRHSMatrix = CLSynthetizeFunction<CLGEMMReshapeRHSMatrixKernel>;
+// Create function for ClGemmReshapeRhsMatrixKernel
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<ClGemmReshapeRhsMatrixKernel>;
 
-// Create function for CLGEMMMatrixMultiplyReshapedKernel
-using CLGEMMMatrixMultiplyReshaped = CLSynthetizeFunction<CLGEMMMatrixMultiplyReshapedKernel>;
+// Create function for ClGemmMatrixMultiplyReshapedKernel
+using CLGEMMMatrixMultiplyReshaped = CLSynthetizeOperator<ClGemmMatrixMultiplyReshapedKernel>;
 
 // Fixture for CLGEMMMatrixMultiplyReshaped
 template <typename T>
@@ -109,6 +110,7 @@ const auto b_values = framework::dataset::make("batch_size", 2, 3);
 const auto act_values = framework::dataset::make("Activation",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
 });
 
 /** Alpha values to test - Precommit */
@@ -327,7 +329,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                framework::dataset::make("Expected", { true, true, false, false, false, true, true,true})),
                     input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected)
 {
-   ARM_COMPUTE_EXPECT(bool(CLGEMMMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
+    ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
                                                           &input1_info.clone()->set_is_resizable(true),
                                                           &input2_info.clone()->set_is_resizable(true),
                                                           &output_info.clone()->set_is_resizable(true),1.f,1.f,
@@ -335,6 +337,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                           rhs_info,
                                                           gemm_info)) == expected, framework::LogLevel::ERRORS);
 }
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
@@ -360,7 +363,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<float>, fra
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<float>, framework::DatasetMode::DISABLED,
@@ -385,7 +396,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<float>, fra
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::ALL,
@@ -410,7 +429,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::DISABLED,
@@ -435,8 +462,17 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
+
 TEST_SUITE(ExportToCLImage)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
@@ -559,10 +595,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                       true,
                                                       true,
                                                       false,
-                                                      false})),
+                                                      true})),
                     input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected)
 {
-   ARM_COMPUTE_EXPECT(bool(CLGEMMMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
+   ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
                                                           &input1_info.clone()->set_is_resizable(true),
                                                           &input2_info.clone()->set_is_resizable(true),
                                                           &output_info.clone()->set_is_resizable(true),1.f,1.f,
@@ -703,6 +739,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+
 TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP32
 
@@ -730,7 +767,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<half>, fram
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::DISABLED,
@@ -755,7 +800,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<half>, fram
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>, framework::DatasetMode::ALL,
@@ -780,7 +833,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>, framework::DatasetMode::DISABLED,
@@ -805,7 +866,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 TEST_SUITE(ExportToCLImage)
@@ -930,10 +999,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                       true,
                                                       true,
                                                       false,
-                                                      false})),
+                                                      true})),
                     input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected)
 {
-   ARM_COMPUTE_EXPECT(bool(CLGEMMMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
+   ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
                                                           &input1_info.clone()->set_is_resizable(true),
                                                           &input2_info.clone()->set_is_resizable(true),
                                                           &output_info.clone()->set_is_resizable(true),1.f,1.f,
@@ -1074,6 +1143,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+
 TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP16
 
@@ -1101,7 +1171,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixtu
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture<half>, framework::DatasetMode::DISABLED,
@@ -1126,7 +1204,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixtu
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionFixture<half>, framework::DatasetMode::ALL,
@@ -1151,7 +1237,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionFixture<half>, framework::DatasetMode::DISABLED,
@@ -1176,8 +1270,17 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF
                                                                    act_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
+
 TEST_SUITE_END() // MixedPrecision
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // GEMMMatrixMultiplyReshaped
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index ebcecb8b78..dafc8dc5ec 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -45,12 +45,13 @@ namespace test
 namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::opencl::kernels;
 
-// Create function for CLGEMMReshapeRHSMatrixKernel
-using CLGEMMReshapeRHSMatrix = CLSynthetizeFunction<CLGEMMReshapeRHSMatrixKernel>;
+// Create function for ClGemmReshapeRhsMatrixKernel
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<ClGemmReshapeRhsMatrixKernel>;
 
-// Create function for CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-using CLGEMMMatrixMultiplyReshapedOnlyRHS = CLSynthetizeFunction<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>;
+// Create function for ClGemmMatrixMultiplyReshapedOnlyRhsKernel
+using CLGEMMMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>;
 
 // Fixture for CLGEMMMatrixMultiplyReshapedOnlyRHS
 template <typename T>
@@ -98,6 +99,7 @@ const auto b_values = framework::dataset::make("batch_size", 2);
 const auto act_values = framework::dataset::make("Activation",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 10.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
 });
 
 /** M0 values to test - precommit */
@@ -210,6 +212,7 @@ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
     CLGEMMMatrixMultiplyReshapedOnlyRHS gemm;
     return bool(gemm.validate(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info));
 }
+
 } // namespace
 
 TEST_SUITE(CL)
@@ -461,6 +464,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
@@ -589,6 +593,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+
 TEST_SUITE_END() // FP16
 
 TEST_SUITE_END() // Float
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp
new file mode 100644
index 0000000000..3b3cf85317
--- /dev/null
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/Helper.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/GEMMFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::opencl::kernels;
+
+// Create function for ClGemmReshapeRhsMatrixKernel
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<ClGemmReshapeRhsMatrixKernel>;
+
+// Create function for ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel
+using CLGEMMMatrixMultiplyReshapedOnlyRhsMMUL = CLSynthetizeOperator<ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel>;
+
+// Fixture for CLGEMMMatrixMultiplyReshapedOnlyRhsMMUL
+template <typename T>
+using CLGEMMMatrixMultiplyReshapedOnlyRhsMMULFixture = GEMMMatrixMultiplyReshapedOnlyRhsMMULValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRhsMMUL>;
+
+namespace
+{
+// *INDENT-OFF*
+// clang-format off
+RelativeTolerance<float> rel_tolerance_f32(0.001f);
+constexpr float          abs_tolerance_f32(0.0001f);
+RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.001f));
+constexpr float          abs_tolerance_f16(0.3f);
+
+/** Alpha values to test - Precommit */
+const auto a_values = framework::dataset::make("alpha", {1.0f, 0.75f} );
+
+/** Beta values to test - Precommit */
+const auto beta_values = framework::dataset::make("beta", {0.0f, -0.75f} );
+
+/** M values to test */
+const auto m_values = framework::dataset::make("M", {49});
+
+/** N values to test */
+const auto n_values = framework::dataset::make("N", {257});
+
+/** K values to test */
+/** The test case requires this to be a multiple of 4 */
+const auto k_values = framework::dataset::make("K", {192});
+
+/** Batch size values to test */
+const auto b_values = framework::dataset::make("batch_size", {1, 2});
+
+/** Activation values to test */
+const auto act_values = framework::dataset::make("Activation",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+});
+
+/** M0 values to test - Precommit */
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 2, 4 });
+
+/** N0 values to test - Precommit */
+const auto n0_values_precommit = framework::dataset::make("N0", { 4, 8 });
+
+/** K0 values to test - Precommit */
+const auto k0_values_precommit = framework::dataset::make("K0", { 1 });
+
+/** Broadcast bias from vector to matrix */
+const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { false, true } );
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(GEMMMatrixMultiplyReshapedOnlyRhsMMUL)
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedOnlyRhsMMULFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   framework::dataset::make("ExportToCLImage", false)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // F32, ExportToCLImage = false: validate against the reference if validate_result is set, otherwise log that the test was skipped
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedOnlyRhsMMULFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   framework::dataset::make("ExportToCLImage", false)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // F16, ExportToCLImage = false: validate against the reference if validate_result is set, otherwise log that the test was skipped
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(ExportToCLImage)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedOnlyRhsMMULFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   framework::dataset::make("ExportToCLImage", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // F32, ExportToCLImage = true: validate against the reference if validate_result is set, otherwise log that the test was skipped
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedOnlyRhsMMULFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   framework::dataset::make("ExportToCLImage", true)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // F16, ExportToCLImage = true: validate against the reference if validate_result is set, otherwise log that the test was skipped
+    if(validate_result)
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // ExportToCLImage
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // GEMMMatrixMultiplyReshapedOnlyRhsMMUL
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
index 34c37dffde..0dd9b811f6 100644
--- a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
+++ b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -43,9 +43,10 @@ namespace test
 namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::opencl::kernels;
 
 // Initialize the output tensor with zero and fill the border with zero
-using CLGEMMReshapeLHSMatrix = CLSynthetizeFunctionInitOutputWithZeroAndWithZeroConstantBorder<CLGEMMReshapeLHSMatrixKernel, 16>;
+using CLGEMMReshapeLHSMatrix = CLSynthetizeOperatorInitOutputWithZeroAndWithZeroConstantBorder<ClGemmReshapeLhsMatrixKernel, 16>;
 
 template <typename T>
 using CLGEMMReshapeLHSMatrixFixture = GEMMReshapeLHSMatrixValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeLHSMatrix, T, false>;
@@ -65,8 +66,10 @@ const auto b_values = framework::dataset::make("batchsize", 1, 3);
 
 /** M0 values to test */
 const auto m0_values_s32 = framework::dataset::make("M0", { 2, 3 });
-const auto m0_values_s16 = framework::dataset::make("M0", { 4, 5 });
-const auto m0_values_s8 = framework::dataset::make("M0", { 6, 7, 8 });
+const auto m0_values_s16 = framework::dataset::make("M0", { 4 });
+const auto m0_values_s16_nt = framework::dataset::make("M0", { 5 }); // M0 values used by the NotTransposed S16 test
+const auto m0_values_s8_nt = framework::dataset::make("M0", { 6,7 }); // M0 values used by the NotTransposed S8 test
+const auto m0_values_s8 = framework::dataset::make("M0", { 8 });
 
 /** K0 values to test */
 const auto k0_values_s32 = framework::dataset::make("K0", { 2, 3 });
@@ -100,6 +103,7 @@ FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrixFixture<int>, framework::Datas
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+
 FIXTURE_DATA_TEST_CASE(S16, CLGEMMReshapeLHSMatrixFixture<short>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
                                                                    b_values),
@@ -113,6 +117,7 @@ FIXTURE_DATA_TEST_CASE(S16, CLGEMMReshapeLHSMatrixFixture<short>, framework::Dat
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+
 FIXTURE_DATA_TEST_CASE(S8, CLGEMMReshapeLHSMatrixFixture<char>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
                                                                    b_values),
@@ -127,6 +132,37 @@ FIXTURE_DATA_TEST_CASE(S8, CLGEMMReshapeLHSMatrixFixture<char>, framework::Datas
     validate(CLAccessor(_target), _reference);
 }
 
+TEST_SUITE(NotTransposed)
+FIXTURE_DATA_TEST_CASE(S16, CLGEMMReshapeLHSMatrixFixture<short>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
+                                                                   b_values),
+                                                                   framework::dataset::make("DataType", DataType::S16)),
+                                                                   m0_values_s16_nt),
+                                                                   k0_values_s16),
+                                                                   v0_values),
+                                                                   i_values),
+                                                                   framework::dataset::make("transpose", { false })))
+{
+    // Validate output: compare the CL target against the reference for the S16, transpose == false configurations
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(S8, CLGEMMReshapeLHSMatrixFixture<char>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
+                                                                   b_values),
+                                                                   framework::dataset::make("DataType", DataType::S8)),
+                                                                   m0_values_s8_nt),
+                                                                   k0_values_s8),
+                                                                   v0_values),
+                                                                   i_values),
+                                                                   framework::dataset::make("transpose", { false })))
+{
+    // Validate output: compare the CL target against the reference for the S8, transpose == false configurations
+    validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END() // NotTransposed
+
 TEST_SUITE(ReinterpretInputAs3D)
 FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrix3DFixture<int>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape3DShapes(),
diff --git a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp
index 14048e81ec..f8462058a6 100644
--- a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp
+++ b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -73,9 +73,10 @@ const auto i_values = framework::dataset::make("interleave", { true, false });
 } // namespace
 
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::opencl::kernels;
 
 // Initialize the output tensor with zero and fill the border with zero
-using CLGEMMReshapeRHSMatrix = CLSynthetizeFunctionInitOutputWithZeroAndWithZeroConstantBorder<CLGEMMReshapeRHSMatrixKernel, 16>;
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperatorInitOutputWithZeroAndWithZeroConstantBorder<ClGemmReshapeRhsMatrixKernel, 16>;
 
 template <typename T>
 using CLGEMMReshapeRHSMatrixFixture = GEMMReshapeRHSMatrixValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, T>;
@@ -117,7 +118,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
     rhs_info.transpose = true;
     rhs_info.interleave = true;
 
-    bool has_error = bool(CLGEMMReshapeRHSMatrixKernel::validate(&input_info.clone()->set_is_resizable(false), (output_info.total_size() == 0) ? nullptr : &output_info.clone()->set_is_resizable(false), rhs_info));
+    bool has_error = bool(ClGemmReshapeRhsMatrixKernel::validate(&input_info.clone()->set_is_resizable(false), (output_info.total_size() == 0) ? nullptr : &output_info.clone()->set_is_resizable(false), rhs_info));
     ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
 }
 
@@ -158,9 +159,9 @@ DATA_TEST_CASE(ValidatePadding, framework::DatasetMode::ALL, combine(combine(com
         padding = round_up_width - output_shape[0];
     }
 
-    CLGEMMReshapeRHSMatrixKernel kernel;
+    ClGemmReshapeRhsMatrixKernel kernel;
 
-    kernel.configure(&input, &output, rhs_info);
+    kernel.configure(CLKernelLibrary::get().get_compile_context(), input.info(), output.info(), rhs_info);
 
     ARM_COMPUTE_EXPECT((output.info()->padding().right == padding), framework::LogLevel::ERRORS);
 }
diff --git a/tests/validation/CL/Gather.cpp b/tests/validation/CL/Gather.cpp
index f0b87d7d9f..7619baae1e 100644
--- a/tests/validation/CL/Gather.cpp
+++ b/tests/validation/CL/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,19 +48,21 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
         framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 27U), 1, DataType::F16),
                                                 TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),
-                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),     // Invalid Indices data type
-                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),     // Invalid Indices dimensionality
-                                                TensorInfo(TensorShape(5U, 5U, 5U, 5U, 5U), 1, DataType::F32),    // Invalid Input dimensionality
-                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F16),     // Mismatching data type input/output
-                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),     // Invalid positive axis value
-                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F16),     // Invalid negative axis value
+                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),                // Invalid Output shape
+                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),                // Invalid Indices data type
+                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),                // Invalid Indices dimensionality
+                                                TensorInfo(TensorShape(5U, 5U, 5U, 5U, 5U), 1, DataType::F32),      // Invalid Input dimensionality
+                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F16),                // Mismatching data type input/output
+                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),                // Invalid positive axis value
+                                                TensorInfo(TensorShape(27U, 27U), 1, DataType::F16),                // Invalid negative axis value
         }),
         framework::dataset::make("IndicesInfo", {
                                                 TensorInfo(TensorShape(10U), 1, DataType::U32),
                                                 TensorInfo(TensorShape(10U), 1, DataType::U32),
                                                 TensorInfo(TensorShape(10U), 1, DataType::U32),
-                                                TensorInfo(TensorShape(10U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(10U, 10U), 1, DataType::U32),
+                                                TensorInfo(TensorShape(10U), 1, DataType::U8),
+                                                TensorInfo(TensorShape(10U, 10U, 10U, 10U), 1, DataType::U32),
                                                 TensorInfo(TensorShape(10U), 1, DataType::U32),
                                                 TensorInfo(TensorShape(10U), 1, DataType::U32),
                                                 TensorInfo(TensorShape(10U), 1, DataType::U32),
@@ -71,7 +73,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                 TensorInfo(TensorShape(27U, 10U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(10U, 27U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(10U, 27U), 1, DataType::F32),
-                                                TensorInfo(TensorShape(27U, 10U), 1, DataType::F32),
+                                                TensorInfo(TensorShape(10U, 27U), 1, DataType::F32),
+                                                TensorInfo(TensorShape(27U, 10U, 10U, 10U, 10U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(10U, 5U, 5U, 5U, 5U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(27U, 10U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(27U, 27U), 1, DataType::F32),
@@ -82,13 +85,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                             1,
                                             -2,
                                             0,
+                                            0,
                                             1,
                                             0,
                                             1,
                                             2,
                                             -3,
         })),
-        framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, false })),
+        framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, false, false })),
         input_info, indices_info, output_info, axis, expected)
 {
     const Status status = CLGather::validate(&input_info.clone()->set_is_resizable(true), &indices_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), axis);
@@ -111,6 +115,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmallMultiDimIndices,
+                       CLGatherFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::CLSmallGatherMultiDimIndicesDataset(), framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: compare the F16 CL target against the reference (no tolerance arguments are passed)
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLGatherFixture<half>,
                        framework::DatasetMode::NIGHTLY,
@@ -131,6 +144,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmallMultiDimIndices,
+                       CLGatherFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::CLSmallGatherMultiDimIndicesDataset(), framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output: compare the F32 CL target against the reference (no tolerance arguments are passed)
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLGatherFixture<float>,
                        framework::DatasetMode::NIGHTLY,
@@ -152,6 +174,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmallMultiDimIndices,
+                       CLGatherFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::CLSmallGatherMultiDimIndicesDataset(), framework::dataset::make("DataType", DataType::U8)))
+{
+    // Validate output: compare the U8 CL target against the reference (no tolerance arguments are passed)
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLGatherFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
@@ -172,6 +203,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmallMultiDimIndices,
+                       CLGatherFixture<uint16_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::CLSmallGatherMultiDimIndicesDataset(), framework::dataset::make("DataType", DataType::U16)))
+{
+    // Validate output: compare the U16 CL target against the reference (no tolerance arguments are passed)
+    validate(CLAccessor(_target), _reference);
+}
+
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLGatherFixture<uint16_t>,
                        framework::DatasetMode::NIGHTLY,
diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp
index c6006efcba..1f5b781690 100644
--- a/tests/validation/CL/Im2Col.cpp
+++ b/tests/validation/CL/Im2Col.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
@@ -40,7 +40,7 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(Im2Col)
 
-using CLIm2Col = CLSynthetizeFunction<CLIm2ColKernel>;
+using ClIm2Col = ClSynthetizeOperatorWithBorder<opencl::kernels::ClIm2ColKernel>;
 
 /** Negative tests
  *
@@ -63,7 +63,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto output    = TensorInfo(TensorShape(9U, 10U, 12U, 2U), 1, DataType::F32);
         const auto conv_size = Size2D(3, 3);
         const bool has_bias  = false;
-        const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
+        const auto status    = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -73,7 +73,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto output    = TensorInfo(TensorShape(9U, 80U, 2U), 1, DataType::QASYMM8);
         const auto conv_size = Size2D(3, 3);
         const bool has_bias  = true;
-        const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
+        const auto status    = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -84,7 +84,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto conv_size = Size2D(3, 3);
         const auto dilation  = Size2D(0, 1);
         const bool has_bias  = false;
-        const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias, dilation);
+        const auto status    = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias, dilation);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -96,7 +96,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto         dilation   = Size2D(1, 1);
         const bool         has_bias   = false;
         const unsigned int num_groups = 2;
-        const auto         status     = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias, dilation, num_groups);
+        const auto         status     = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias, dilation, num_groups);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -108,7 +108,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto         dilation   = Size2D(1, 1);
         const bool         has_bias   = false;
         const unsigned int num_groups = 2;
-        const auto         status     = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias, dilation, num_groups);
+        const auto         status     = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias, dilation, num_groups);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -118,7 +118,7 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto output    = TensorInfo(TensorShape(9U, 81U, 2U), 1, DataType::F32);
         const auto conv_size = Size2D(3, 3);
         const bool has_bias  = false;
-        const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
+        const auto status    = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 
@@ -128,13 +128,13 @@ TEST_CASE(Negative, framework::DatasetMode::ALL)
         const auto output    = TensorInfo(TensorShape(1U, 1U, 1U, 2U), 1, DataType::F32, DataLayout::NHWC);
         const auto conv_size = Size2D(9, 9);
         const bool has_bias  = false;
-        const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
+        const auto status    = opencl::kernels::ClIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
 }
 
 template <typename T>
-using CLIm2ColFixture = Im2ColValidationFixture<CLTensor, CLAccessor, CLIm2Col, T, true>;
+using ClIm2ColFixture = Im2ColOpValidationFixture<CLTensor, CLAccessor, ClIm2Col, T, true>;
 
 TEST_SUITE(NHWC)
 
@@ -150,7 +150,7 @@ TEST_SUITE(NHWC)
  *  Kernel tested im2col3x3_nhwc
  */
 FIXTURE_DATA_TEST_CASE(W3x3,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape",
@@ -180,7 +180,7 @@ framework::dataset::make("Groups", 1)))
  *  Kernel tested im2col9x9_nhwc
  */
 FIXTURE_DATA_TEST_CASE(W9x9,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape",
@@ -210,7 +210,7 @@ framework::dataset::make("Groups", 1)))
  *  Kernel tested im2col_generic_nhwc
  */
 FIXTURE_DATA_TEST_CASE(Generic,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape",
@@ -243,7 +243,7 @@ TEST_SUITE(NCHW)
  *  Kernel tested im2col1x1_stridex1_nchw
  */
 FIXTURE_DATA_TEST_CASE(W1x1_Stride1_NoPad,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", { TensorShape(4U, 4U, 3U, 2U), TensorShape(5U, 4U, 3U, 2U), TensorShape(3U, 4U, 3U, 2U) }),
@@ -267,7 +267,7 @@ FIXTURE_DATA_TEST_CASE(W1x1_Stride1_NoPad,
  *  Kernel tested im2col3x3_nchw
  */
 FIXTURE_DATA_TEST_CASE(W3x3,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", TensorShape(4U, 4U, 3U, 2U)),
@@ -291,7 +291,7 @@ FIXTURE_DATA_TEST_CASE(W3x3,
  *  Kernel tested im2col5x5_nchw
  */
 FIXTURE_DATA_TEST_CASE(W5x5,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", TensorShape(7U, 4U, 3U, 2U)),
@@ -317,7 +317,7 @@ FIXTURE_DATA_TEST_CASE(W5x5,
  * Kernel tested im2col11x11_padx0_pady0_nchw
  */
 FIXTURE_DATA_TEST_CASE(W11x11_NoPad,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", { TensorShape(11U, 11U, 2U, 2U), TensorShape(14U, 13U, 1U, 2U) }),
@@ -341,7 +341,7 @@ FIXTURE_DATA_TEST_CASE(W11x11_NoPad,
  * Kernel tested im2col_generic_padx0_pady0_nchw
  */
 FIXTURE_DATA_TEST_CASE(GenericZeroPad,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", TensorShape(13U, 11U, 2U, 2U)),
@@ -367,7 +367,7 @@ TEST_SUITE_END() // NCHW
  * Kernel tested im2col_generic_(nchw|nhwc)
  */
 FIXTURE_DATA_TEST_CASE(Generic,
-                       CLIm2ColFixture<float>,
+                       ClIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", TensorShape(13U, 11U, 5U, 2U)),
@@ -393,7 +393,7 @@ FIXTURE_DATA_TEST_CASE(Generic,
  *  - im2col9x9_nhwc
  */
 FIXTURE_DATA_TEST_CASE(Quantized,
-                       CLIm2ColFixture<uint8_t>,
+                       ClIm2ColFixture<uint8_t>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", TensorShape(13U, 11U, 11U, 2U)),
@@ -419,7 +419,7 @@ FIXTURE_DATA_TEST_CASE(Quantized,
  *  - im2col9x9_nhwc
  */
 FIXTURE_DATA_TEST_CASE(FP16,
-                       CLIm2ColFixture<half>,
+                       ClIm2ColFixture<half>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape", TensorShape(13U, 11U, 11U, 2U)),
diff --git a/tests/validation/CL/Remap.cpp b/tests/validation/CL/IndirectConv2dAddressPrecalculation.cpp
index f73073105b..67f70685d1 100644
--- a/tests/validation/CL/Remap.cpp
+++ b/tests/validation/CL/IndirectConv2dAddressPrecalculation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,18 +22,19 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
 #include "tests/CL/CLAccessor.h"
+#include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/RemapFixture.h"
+#include "tests/validation/fixtures/IndirectConv2dAddressPrecalculationFixture.h"
 
 namespace arm_compute
 {
@@ -41,37 +42,48 @@ namespace test
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::opencl::kernels;
+
+using CLIndirectConv2dAddressPrecalculation = CLSynthetizeOperator<ClIndirectConv2dAddressPrecalculationKernel>;
+
+using CLIndirectConv2dAddressPrecalculationFixture = IndirectConv2dAddressPrecalculationValidationFixture<CLTensor, CLAccessor, CLIndirectConv2dAddressPrecalculation>;
+
+// *INDENT-OFF*
+// clang-format off
+/** Datasets of test parameter values */
+
 namespace
 {
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-constexpr float                      tolerance_number = 0.2f;
+const auto src_w_values  = framework::dataset::make("src_w", {91});
+const auto src_h_values  = framework::dataset::make("src_h", {103});
+const auto src_b_values  = framework::dataset::make("src_b", {1, 2});
+const auto wei_w_values  = framework::dataset::make("wei_w", {3, 5});
+const auto wei_h_values  = framework::dataset::make("wei_h", {1, 6});
+const auto pad_values    = framework::dataset::make("pad", {1, 2, 3});
+const auto stride_values = framework::dataset::make("stride", {1, 2});
+const auto m0_values     = framework::dataset::make("M0", { 1, 2, 4, 5, 7 });
 } // namespace
 
 TEST_SUITE(CL)
-TEST_SUITE(Remap)
-template <typename T>
-using CLRemapFixture = RemapValidationFixture<CLTensor, CLAccessor, CLRemap, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLRemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
+TEST_SUITE(IndirectConv2dAddressPrecalculation)
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLRemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIndirectConv2dAddressPrecalculationFixture, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(src_w_values,
+                                                                        src_h_values),
+                                                                        src_b_values),
+                                                                        wei_w_values),
+                                                                        wei_h_values),
+                                                                        pad_values),
+                                                                        stride_values),
+                                                                        m0_values))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
+    validate(CLAccessor(_target), _reference);
 }
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // IndirectConv2dAddressPrecalculation
+TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/IndirectConvolutionLayer.cpp b/tests/validation/CL/IndirectConvolutionLayer.cpp
new file mode 100644
index 0000000000..aedf070e6b
--- /dev/null
+++ b/tests/validation/CL/IndirectConvolutionLayer.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/DirectConvolutionLayerFixture.h"
+
+// Note: Since the interface of indirect convolution is the same as that of direct convolution, we can
+// reuse the direct convolution fixture
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<half>  tolerance_fp16(half(0.2));  /**< Relative tolerance for FP16 tests */
+RelativeTolerance<float> tolerance_fp32(0.05f);      /**< Relative tolerance for FP32 tests */
+constexpr float          abs_tolerance_f32(0.0001f); /**< Absolute tolerance for FP32 tests */
+constexpr float          tolerance_num = 0.07f;      /**< Tolerance number passed to validate() in the FP16 tests */
+
+/** Activation function dataset. NOTE(review): not referenced by any test in this file - confirm it is still needed. */
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f) });
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(IndirectConvolutionLayer)
+
+/** Check whether the configuration of an indirect convolution layer with no
+ * bias leads to a successful run.
+ */
+TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
+{
+    const TensorShape    src_shape_nhwc = TensorShape(8U, 27U, 13U);
+    const TensorShape    wei_shape_nhwc = TensorShape(8U, 3U, 3U, 4U);
+    const TensorShape    bia_shape      = TensorShape(4U);
+    const TensorShape    dst_shape_nhwc = TensorShape(4U, 25U, 11U);
+    constexpr DataType   dt             = DataType::F32;
+    constexpr DataLayout data_layout    = DataLayout::NHWC;
+
+    auto src_nhwc = create_tensor<CLTensor>(src_shape_nhwc, dt, 1, QuantizationInfo(), data_layout);
+    auto wei_nhwc = create_tensor<CLTensor>(wei_shape_nhwc, dt, 1, QuantizationInfo(), data_layout);
+    auto dst_nhwc = create_tensor<CLTensor>(dst_shape_nhwc, dt, 1, QuantizationInfo(), data_layout);
+
+    TensorShape src_shape_nchw = src_shape_nhwc;
+    TensorShape wei_shape_nchw = wei_shape_nhwc;
+    TensorShape dst_shape_nchw = dst_shape_nhwc;
+
+    permute(src_shape_nchw, PermutationVector(1U, 2U, 0U));
+    permute(wei_shape_nchw, PermutationVector(1U, 2U, 0U, 3U));
+    permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+    const PadStrideInfo conv_info = PadStrideInfo(1, 1, 0, 0);
+
+    // Create the indirect convolution function and configure it with a null bias pointer
+    CLIndirectConvolutionLayer conv{};
+    conv.configure(&src_nhwc, &wei_nhwc, nullptr, &dst_nhwc, conv_info);
+
+    src_nhwc.allocator()->allocate();
+    wei_nhwc.allocator()->allocate();
+    dst_nhwc.allocator()->allocate();
+
+    library->fill_tensor_value(CLAccessor(src_nhwc), 1.f);
+    library->fill_tensor_value(CLAccessor(wei_nhwc), 1.f);
+
+    conv.run();
+
+    // Compute the reference output to compare against
+    SimpleTensor<float> ref_src{ src_shape_nchw, dt };
+    SimpleTensor<float> ref_wei{ wei_shape_nchw, dt };
+    SimpleTensor<float> ref_bia{ bia_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_wei, 1.f);
+    // No bias: the reference call takes a bias tensor, so it is filled with zeros
+    library->fill_tensor_value(ref_bia, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info);
+
+    validate(CLAccessor(dst_nhwc), ref_dst);
+}
+
+/** Check whether rectangular kernels, i.e. when the width and height of the weight shape are not equal,
+ *  lead to a successful run
+ */
+TEST_CASE(NonSquareKernel, framework::DatasetMode::PRECOMMIT)
+{
+    const TensorShape    src_shape_nhwc = TensorShape(3U, 33U, 27U);
+    const TensorShape    wei_shape_nhwc = TensorShape(3U, 5U, 7U, 4U); // non-square kernel
+    const TensorShape    bia_shape      = TensorShape(4U);
+    const TensorShape    dst_shape_nhwc = TensorShape(4U, 11U, 12U);
+    constexpr DataType   dt             = DataType::F32;
+    constexpr DataLayout data_layout    = DataLayout::NHWC;
+
+    auto src_nhwc = create_tensor<CLTensor>(src_shape_nhwc, dt, 1, QuantizationInfo(), data_layout);
+    auto wei_nhwc = create_tensor<CLTensor>(wei_shape_nhwc, dt, 1, QuantizationInfo(), data_layout);
+    auto dst_nhwc = create_tensor<CLTensor>(dst_shape_nhwc, dt, 1, QuantizationInfo(), data_layout);
+
+    TensorShape src_shape_nchw = src_shape_nhwc;
+    TensorShape wei_shape_nchw = wei_shape_nhwc;
+    TensorShape dst_shape_nchw = dst_shape_nhwc;
+
+    permute(src_shape_nchw, PermutationVector(1U, 2U, 0U));
+    permute(wei_shape_nchw, PermutationVector(1U, 2U, 0U, 3U));
+    permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+    const PadStrideInfo conv_info = PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR);
+
+    // Create the indirect convolution function and configure it with a null bias pointer
+    CLIndirectConvolutionLayer conv{};
+    conv.configure(&src_nhwc, &wei_nhwc, nullptr, &dst_nhwc, conv_info);
+
+    src_nhwc.allocator()->allocate();
+    wei_nhwc.allocator()->allocate();
+    dst_nhwc.allocator()->allocate();
+
+    library->fill_tensor_value(CLAccessor(src_nhwc), 1.f);
+    library->fill_tensor_value(CLAccessor(wei_nhwc), 1.f);
+
+    conv.run();
+
+    // Compute the reference output to compare against
+    SimpleTensor<float> ref_src{ src_shape_nchw, dt };
+    SimpleTensor<float> ref_wei{ wei_shape_nchw, dt };
+    SimpleTensor<float> ref_bia{ bia_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_wei, 1.f);
+    // No bias: the reference call takes a bias tensor, so it is filled with zeros
+    library->fill_tensor_value(ref_bia, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info);
+
+    validate(CLAccessor(dst_nhwc), ref_dst);
+}
+// *INDENT-OFF*
+// clang-format off
+// Note: Since the interface of indirect convolution is the same as that of direct convolution, we can
+// reuse the direct convolution fixture
+template <typename T>
+using CLIndirectConvolutionLayerFixture = DirectConvolutionValidationFixture<CLTensor, CLAccessor, CLIndirectConvolutionLayer, T>;
+template <typename T>
+using CLIndirectConvolutionLayerMixedDataLayoutFixture = DirectConvolutionValidationFixture<CLTensor, CLAccessor, CLIndirectConvolutionLayer, T, true>;
+
+TEST_SUITE(NHWC)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIndirectConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+               combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
+               framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
+               framework::dataset::make("PadX", { 1, 3, 0, 4 })),
+               framework::dataset::make("PadY", { 1, 3, 0, 4 })),
+               framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
+               framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
+               framework::dataset::make("DataType",  DataType::F16)),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIndirectConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+               combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 1 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 1 })),
+               framework::dataset::make("KernelSize", { 9 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::F16)),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::IDENTITY) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIndirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+               combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
+               framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
+               framework::dataset::make("PadX", { 1, 3, 0, 4 })),
+               framework::dataset::make("PadY", { 1, 3, 0, 4 })),
+               framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
+               framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
+               framework::dataset::make("DataType",  DataType::F32)),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLIndirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+               combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 2 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 3 })),
+               framework::dataset::make("KernelSize", { 3 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::F32)),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIndirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+               combine(combine(combine(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 1 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 1 })),
+               framework::dataset::make("KernelSize", { 9 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::F32)),
+               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::IDENTITY) )),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.0, abs_tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // NHWC
+TEST_SUITE_END() // IndirectConvolutionLayer
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/LogLayer.cpp b/tests/validation/CL/LogLayer.cpp
index 95c4f1226e..895c306841 100644
--- a/tests/validation/CL/LogLayer.cpp
+++ b/tests/validation/CL/LogLayer.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/MatMul.cpp b/tests/validation/CL/MatMul.cpp
new file mode 100644
index 0000000000..844597f3e9
--- /dev/null
+++ b/tests/validation/CL/MatMul.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ActivationFunctionsDataset.h"
+#include "tests/framework/DatasetModes.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/TestCase.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/validation/fixtures/MatMulFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
+constexpr float          abs_tolerance_f32(
+    0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(
+    0.001f);                                                    /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16  data type in case using relative tolerance fails because of small values */
+RelativeTolerance<half_float::half>  tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_quant(1);        /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+} // namespace
+
+template <typename T>
+using CLMatMulFixture = MatMulValidationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLQuantizedMatMulFixture = QuantizedMatMulValidationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLMatMulActivationFixture = MatMulValidationWithActivationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLMatMulActivationAlphaBetaFixture = MatMulValidationWithActivationAlphaBetaFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLQuantizedMatMulActivationFixture = QuantizedMatMulValidationWithActivationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+/* The main activation functions that matmul (float) is expected to support */
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.75f, 0.25f),
+});
+
+/* (Float datatype only) Larger activation functions dataset, used during some nightly tests. */
+const auto AllActivationsDataset = combine(datasets::ActivationFunctions(), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+
+// Alpha beta values should be integer values
+// This is for testing purposes with quantized datatypes and is not a limitation of the kernel.
+// To properly remove this restriction, dst_qinfo should be auto-initialised with consideration for alpha beta values
+// The main act functions quantized matmul kernels are expected to support
+const auto ActivationFunctionsQuantizedDataset = concat(concat(concat(
+                                                                   framework::dataset::make("ActivationInfo", ActivationLayerInfo()),
+                                                                   framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))),
+                                                               framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 1.f))),
+                                                        framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 2.f, 1.f)));
+
+TEST_SUITE(CL)
+TEST_SUITE(MatMul)
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                        framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                        framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                framework::dataset::make("DataType", DataType::F32)),
+                                                                                                        ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                    framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                    framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
+                                                                                                            ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunAllActivations, CLMatMulActivationAlphaBetaFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallerMatMulDataset(),
+                       framework::dataset::make("TransposeA", { false })),
+                       framework::dataset::make("TransposeB", { true })),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       AllActivationsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulActivationFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                       framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                       framework::dataset::make("TransposeB", { false, true })),
+                                                                                                               framework::dataset::make("DataType", DataType::F16)),
+                                                                                                       ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulActivationFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                   framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                                                           ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                                     datasets::SmallMatMulDataset(),
+                                                                                                                     framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                 framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                 ActivationFunctionsQuantizedDataset),
+                                                                                                                 framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                                                                                                                 framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+                                                                                                                 framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+                                                                                                         framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
+{
+    // QASYMM8 run (DatasetMode::ALL) on the small dataset with fixed lhs/rhs/dst quantization info: compare the CL output with the reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+        datasets::LargeMatMulDataset(),
+        framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                     framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                     ActivationFunctionsQuantizedDataset),
+                                                                                                                     framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                                                                                                                     framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+                                                                                                                     framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+                                                                                                             framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
+{
+    // Nightly QASYMM8 run on the large dataset with fixed lhs/rhs/dst quantization info: compare the CL output with the reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
+        datasets::SmallMatMulDataset(),
+        framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                        framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                        ActivationFunctionsQuantizedDataset),
+                                                                                                                        framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                                                                                                                        framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+                                                                                                                framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+                                                                                                        framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })))
+{
+    // QASYMM8_SIGNED run (DatasetMode::ALL) on the small dataset with fixed lhs/rhs/dst quantization info: compare the CL output with the reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                                        datasets::LargeMatMulDataset(),
+                                                                                                                        framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                    framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                    ActivationFunctionsQuantizedDataset),
+                                                                                                                    framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+                                                                                                                    framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+                                                                                                                    framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+                                                                                                            framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 50) })))
+{
+    // Nightly QASYMM8_SIGNED run on the large dataset with fixed lhs/rhs/dst quantization info: compare the CL output with the reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/MatMulKernel.cpp b/tests/validation/CL/MatMulKernel.cpp
new file mode 100644
index 0000000000..b47f8bc924
--- /dev/null
+++ b/tests/validation/CL/MatMulKernel.cpp
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MatMulKernelFixture.h"
+#include "tests/validation/reference/Permute.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Relative tolerance for comparing the reference's output against the implementation's output for fp32 data types */
+constexpr float          abs_tolerance_f32(
+    0.0001f); /**< Absolute tolerance for comparing the reference's output against the implementation's output for fp32 data types, for the case where the relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(
+    0.001f);                                                   /**< Absolute tolerance for comparing the reference's output against the implementation's output for fp16 data types, for the case where the relative tolerance fails because of small values */
+RelativeTolerance<half_float::half> tolerance_f16(half(0.01)); /**< Relative tolerance for comparing the reference's output against the implementation's output for fp16 data types */
+} // namespace
+
+/** M0 block sizes used by the precommit (DatasetMode::ALL) test cases */
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 block sizes used by the precommit (DatasetMode::ALL) test cases */
+const auto n0_values_precommit = framework::dataset::make("N0", { 2, 4 });
+
+/** K0 block sizes used by the precommit (DatasetMode::ALL) test cases */
+const auto k0_values_precommit = framework::dataset::make("K0", { 2, 3 });
+
+/** M0 block sizes for the nightly test cases: the _lhs_nt set is paired with TransposeA = false, the _lhs_t set with TransposeA = true */
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+const auto m0_values_nightly_lhs_t  = framework::dataset::make("M0", { 1, 2, 3, 4, 8 });
+
+/** N0 block sizes for the nightly test cases: the _rhs_nt set is paired with TransposeB = false, the _rhs_t set with TransposeB = true */
+const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 });
+const auto n0_values_nightly_rhs_t  = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
+
+/** K0 block sizes for the nightly test cases, one set per lhs/rhs transposition case named in the suffix (nt = not transposed, t = transposed) */
+const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 });
+const auto k0_values_nightly_rhs_t         = framework::dataset::make("K0", { 1, 2, 3, 4, 8 });
+const auto k0_values_nightly_lhs_t_rhs_nt  = framework::dataset::make("K0", { 1, 2, 3, 4, 5, 6, 7, 8 }); // NOTE(review): presumably for lhs transposed / rhs not transposed - confirm at its use site
+
+template <typename T>
+using CLMatMulKernelFixture = MatMulKernelValidationFixture<T, ClMatMulNativeKernel>; // Validation fixture instantiated for ClMatMulNativeKernel
+
+template <typename T>
+using CLMatMulKernelBiasFixture = MatMulKernelWithBiasValidation<T, ClMatMulNativeKernel>; // With-bias validation fixture for ClMatMulNativeKernel; used by RunWithBias
+
+TEST_SUITE(CL)
+TEST_SUITE(MatMulKernel)
+TEST_SUITE(Validate)
+
+TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL) // Checks ClMatMulNativeKernel::validate() against a table of M0/N0/K0 block sizes
+{
+    using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>; // { kernel configuration, expected result of validate() }
+
+    const std::vector<MatMulConfigurationPair> supported_block_sizes =
+    {
+        // Argument order: MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+        // Lhs not-transposed, Rhs-not-transposed
+        { MatMulKernelInfo(false, false, 0, 1, 1), false },  // M0 should be > 0
+        { MatMulKernelInfo(false, false, 3, 5, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 6, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 3, 17), false }, // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 3, 7), false },  // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 9, 1, 2), true },
+        { MatMulKernelInfo(false, false, 3, 16, 3), true },
+        { MatMulKernelInfo(false, false, 7, 3, 4), true },
+        { MatMulKernelInfo(false, false, 7, 3, 4, true), false },  // N0 not in {4, 8, 16}
+        { MatMulKernelInfo(false, false, 7, 1, 4, true), false },  // N0 not in {4, 8, 16}
+        { MatMulKernelInfo(false, false, 7, 12, 4, true), false }, // N0 not in {4, 8, 16}
+        { MatMulKernelInfo(false, false, 7, 4, 4, true), true },
+        { MatMulKernelInfo(false, false, 7, 8, 4, true), true },
+        { MatMulKernelInfo(false, false, 7, 16, 4, true), true },
+
+        // Lhs not-transposed, Rhs transposed
+        { MatMulKernelInfo(false, true, 0, 1, 1), false },  // M0 should be > 0
+        { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 3, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 3, 3, 12), false }, // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 3, 3, 6), false },  // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 5, 1, 2), true },
+        { MatMulKernelInfo(false, true, 3, 3, 3), true },
+        { MatMulKernelInfo(false, true, 2, 4, 8), true },
+        { MatMulKernelInfo(false, true, 2, 4, 5, true), false }, // K0 not in {4, 8, 16}
+        { MatMulKernelInfo(false, true, 2, 4, 9, true), false }, // K0 not in {4, 8, 16}
+        { MatMulKernelInfo(false, true, 2, 4, 3, true), false }, // K0 not in {4, 8, 16}
+        { MatMulKernelInfo(false, true, 2, 4, 4, true), true },
+        { MatMulKernelInfo(false, true, 2, 4, 8, true), true },
+        { MatMulKernelInfo(false, true, 2, 8, 16, true), true },
+
+        // Lhs transposed, Rhs-not-transposed
+        { MatMulKernelInfo(true, false, 1, 1, 0), false },  // K0 should be > 0
+        { MatMulKernelInfo(true, false, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, false, 3, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, false, 6, 3, 12), false }, // M0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, false, 5, 3, 6), false },  // M0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, false, 4, 1, 22), true },
+        { MatMulKernelInfo(true, false, 3, 3, 3), true },
+        { MatMulKernelInfo(true, false, 2, 4, 8), true },
+        { MatMulKernelInfo(true, false, 2, 3, 8, true), false }, // N0 not in {4, 8, 16}
+        { MatMulKernelInfo(true, false, 2, 7, 8, true), false }, // N0 not in {4, 8, 16}
+        { MatMulKernelInfo(true, false, 2, 5, 8, true), false }, // N0 not in {4, 8, 16}
+        { MatMulKernelInfo(true, false, 2, 4, 8, true), true },
+        { MatMulKernelInfo(true, false, 2, 8, 8, true), true },
+        { MatMulKernelInfo(true, false, 2, 16, 8, true), true },
+
+        // Lhs transposed, Rhs-transposed
+        { MatMulKernelInfo(true, true, 2, 1, 5), false },  // K0 should be in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, true, 1, 8, 7), false },  // K0 should be in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, true, 3, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, true, 6, 3, 12), false }, // M0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, true, 5, 3, 6), false },  // M0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(true, true, 4, 8, 16), true },
+        { MatMulKernelInfo(true, true, 3, 3, 4), true },
+        { MatMulKernelInfo(true, true, 16, 4, 8), true },
+        { MatMulKernelInfo(true, true, 2, 2, 1, true), false }, // K0 not in {4, 8, 16}
+        { MatMulKernelInfo(true, true, 2, 2, 5, true), false }, // K0 not in {4, 8, 16}
+        { MatMulKernelInfo(true, true, 2, 4, 7, true), false }, // K0 not in {4, 8, 16}
+        { MatMulKernelInfo(true, true, 2, 4, 4, true), true },
+        { MatMulKernelInfo(true, true, 2, 8, 8, true), true },
+        { MatMulKernelInfo(true, true, 2, 8, 16, true), true },
+    };
+
+    // Use shapes that are large enough for the block sizes not to be truncated, and keep all dimensions
+    // equal so that the same shapes are valid for every NT/T configuration. Only the block sizes are under
+    // test here, not the shapes themselves.
+    const TensorInfo lhs_info = TensorInfo(TensorShape(100U, 100U), 1, DataType::F32);
+    const TensorInfo rhs_info = TensorInfo(TensorShape(100U, 100U), 1, DataType::F32);
+
+    const bool export_to_cl_image_supported = image2d_from_buffer_supported(CLKernelLibrary::get().get_device());
+    for(auto &pair : supported_block_sizes)
+    {
+        TensorInfo output_info;
+        Status     status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first);
+
+        if(!pair.first.export_rhs_to_cl_image || export_to_cl_image_supported) // Entries that export to a CL image are only checked when the device supports it
+        {
+            ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+        }
+    }
+}
+
+TEST_CASE(ExportToCLImage, framework::DatasetMode::ALL)
+{
+    // Nothing to check when the hardware cannot export a buffer to a CL image
+    if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        return;
+    }
+
+    // Device limits used below to build rhs shapes that barely fit into, or do not fit into, a CL image
+    constexpr size_t pixel_size  = 4;
+    const size_t     max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+    const size_t     max_image_w = pixel_size * CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+
+    using ImageExportCase = std::tuple<TensorShape, TensorShape, bool, bool, bool>;
+    const std::vector<ImageExportCase> image_export_cases =
+    {
+        // Columns: lhs_shape, rhs_shape, adj_lhs, adj_rhs, expected result of validate()
+        // Rhs not transposed (Lhs either way)
+        // Transposing the Lhs adds nothing to these checks, so adj_lhs is always false
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), false, false, false },  // N is not a multiple of 4
+        { TensorShape(5U, 1U), TensorShape(14U, 5U), false, false, false }, // N is not a multiple of 4
+        { TensorShape(5U, 1U), TensorShape(12U, 5U), false, false, true },
+        { TensorShape(5U, 1U), TensorShape(8U, 5U), false, false, true },
+        { TensorShape(5U, 1U), TensorShape(4U, 5U), false, false, true },
+        { TensorShape(max_image_h + 1, 1U), TensorShape(4U, max_image_h + 1), false, false, false }, // Does not fit the CL image height
+        { TensorShape(5U, 1U), TensorShape(max_image_w + 1, 5U), false, false, false },              // Does not fit the CL image width
+        { TensorShape(max_image_h, 1U), TensorShape(4U, max_image_h), false, false, true },          // Just fits the CL image height
+        { TensorShape(5U, 1U), TensorShape(max_image_w, 5U), false, false, true },                   // Just fits the CL image width
+
+        // Rhs transposed (Lhs either way)
+        { TensorShape(5U, 1U), TensorShape(5U, 3U), false, true, false },  // K is not a multiple of 4
+        { TensorShape(5U, 1U), TensorShape(5U, 14U), false, true, false }, // K is not a multiple of 4
+        { TensorShape(4U, 1U), TensorShape(4U, 10U), false, true, true },
+        { TensorShape(8U, 1U), TensorShape(8U, 9U), false, true, true },
+        { TensorShape(12U, 1U), TensorShape(12U, 6U), false, true, true },
+    };
+
+    for(const auto &export_case : image_export_cases)
+    {
+        // Unpack the configuration under test
+        const TensorShape &lhs_shape = std::get<0>(export_case);
+        const TensorShape &rhs_shape = std::get<1>(export_case);
+        const bool         adj_lhs   = std::get<2>(export_case);
+        const bool         adj_rhs   = std::get<3>(export_case);
+
+        const TensorInfo lhs_info(lhs_shape, 1, DataType::F32);
+        const TensorInfo rhs_info(rhs_shape, 1, DataType::F32);
+
+        // M0 = N0 = K0 = 4, so the block sizes are valid for CL image export in every combination
+        const MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 4, 4, 4, true /* export_rhs_to_cl_image */ };
+
+        TensorInfo   output_info;
+        const Status status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, matmul_kernel_info);
+
+        const bool expected = std::get<4>(export_case);
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+    // Each entry is { lhs shape, rhs shape, bias shape, expected result of validate() }, written for the Nt/Nt layout.
+    // The lhs/rhs shapes are permuted in the loops below to cover the other transposition combinations.
+    using ShapeCase = std::tuple<TensorShape, TensorShape, TensorShape, bool>;
+    const std::vector<ShapeCase> shape_cases =
+    {
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(3U), true },
+        { TensorShape(10U, 12U), TensorShape(3U, 10U), TensorShape(3U), true },
+        { TensorShape(8U, 4U), TensorShape(2U, 8U), TensorShape(2U), true },
+        { TensorShape(8U, 4U), TensorShape(2U, 5U), TensorShape(2U), false }, // K dimensions disagree
+        { TensorShape(5U, 0U), TensorShape(2U, 5U), TensorShape(2U), false }, // Zero-sized dimension
+        { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), true },
+        { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // Batch broadcasting is not supported
+        { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // Batch dimensions disagree
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(1U), false },                                 // Bias broadcasting is not supported
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(3U, 3U), false },                             // 2D bias is not supported
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(6U), false },                                 // Bias length differs from the dst first dimension
+    };
+
+    const PermutationVector swap_first_two_dims(1U, 0U);
+
+    for(const auto &shape_case : shape_cases)
+    {
+        const bool expected = std::get<3>(shape_case);
+
+        for(const bool adj_lhs : { false, true })
+        {
+            for(const bool adj_rhs : { false, true })
+            {
+                // Permute copies, so the table entry stays in its Nt/Nt form for the next combination
+                TensorShape        lhs_shape = std::get<0>(shape_case);
+                TensorShape        rhs_shape = std::get<1>(shape_case);
+                const TensorShape &bia_shape = std::get<2>(shape_case);
+
+                if(adj_lhs)
+                {
+                    permute(lhs_shape, swap_first_two_dims);
+                }
+
+                if(adj_rhs)
+                {
+                    permute(rhs_shape, swap_first_two_dims);
+                }
+
+                // All tensors are F32; output_info is left default-constructed
+                const TensorInfo lhs_info(lhs_shape, 1, DataType::F32);
+                const TensorInfo rhs_info(rhs_shape, 1, DataType::F32);
+                const TensorInfo bia_info(bia_shape, 1, DataType::F32);
+                TensorInfo       output_info;
+
+                // M0 = N0 = K0 = 1 and no export of the rhs to a CL image
+                const MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 1, false /* export_rhs_to_cl_image */ };
+                const Status           status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+
+                ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+            }
+        }
+    }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    // Each entry is { lhs type, rhs type, dst type, expected result of validate() }; only the Nt/Nt layout is exercised here
+    using TypeCase = std::tuple<DataType, DataType, DataType, bool>;
+    const std::vector<TypeCase> type_cases =
+    {
+        { DataType::F32, DataType::F32, DataType::F32, true },
+        { DataType::F16, DataType::F16, DataType::F16, true },
+        { DataType::F16, DataType::F32, DataType::F32, false },                                              // mixed precision is rejected
+        { DataType::F64, DataType::F64, DataType::F64, false },                                              // double precision is rejected
+        { DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false },                                  // quantized type, rejected
+        { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false },             // quantized type, rejected
+        { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, false }, // quantized type, rejected
+        { DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false },                               // quantized type, rejected
+        { DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false },                                  // quantized type, rejected
+        { DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false },                                     // quantized type, rejected
+        { DataType::S64, DataType::S64, DataType::S64, false },                                              // integral type, rejected
+        { DataType::S32, DataType::S32, DataType::S32, false },                                              // integral type, rejected
+        { DataType::S16, DataType::S16, DataType::S16, false },                                              // integral type, rejected
+        { DataType::S8, DataType::S8, DataType::S8, false },                                                 // integral type, rejected
+        { DataType::U64, DataType::U64, DataType::U64, false },                                              // integral type, rejected
+        { DataType::U32, DataType::U32, DataType::U32, false },                                              // integral type, rejected
+        { DataType::U16, DataType::U16, DataType::U16, false },                                              // integral type, rejected
+        { DataType::U8, DataType::U8, DataType::U8, false },                                                 // integral type, rejected
+    };
+
+    const TensorShape      square_shape(10U, 10U);
+    const MatMulKernelInfo kernel_info{ false, false, 1, 1, 1, false };
+    for(const auto &type_case : type_cases)
+    {
+        const TensorInfo lhs_info(square_shape, 1, std::get<0>(type_case));
+        const TensorInfo rhs_info(square_shape, 1, std::get<1>(type_case));
+        TensorInfo       output_info(square_shape, 1, std::get<2>(type_case));
+
+        const Status status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, kernel_info);
+
+        const bool expected = std::get<3>(type_case);
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_SUITE_END() // Validate
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+TEST_SUITE(Buffer)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLMatMulKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                   framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                   m0_values_precommit),
+                                                                                                                   n0_values_precommit),
+                                                                                                                   k0_values_precommit),
+                                                                                                           framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                   framework::dataset::make("DataType", DataType::F32)))
+{
+    // F32 run (DatasetMode::ALL) on the tiny dataset with the precommit block sizes and ExportRhsToCLImage = false: compare the CL output with the reference
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                    framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                    framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                    m0_values_precommit),
+                                                                                                                    n0_values_precommit),
+                                                                                                                    k0_values_precommit),
+                                                                                                            framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // F32 run (DatasetMode::ALL) on the small dataset with the precommit block sizes and ExportRhsToCLImage = false: compare the CL output with the reference
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulKernelBiasFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                   framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                   m0_values_precommit),
+                                                                                                                   n0_values_precommit),
+                                                                                                                   k0_values_precommit),
+                                                                                                                   framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                           framework::dataset::make("DataType", DataType::F32)))
+
+{
+    // F32 run (DatasetMode::ALL) of the with-bias fixture on the small dataset with the precommit block sizes: compare the CL output with the reference
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulKernelFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false })),
+                                                                                                                   framework::dataset::make("TransposeB", { false })),
+                                                                                                                   m0_values_nightly_lhs_nt),
+                                                                                                                   n0_values_nightly_rhs_nt),
+                                                                                                                   k0_values_nightly_lhs_nt_rhs_nt),
+                                                                                                                   framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                   framework::dataset::make("DataType", DataType::F32)))
+{
+    // Nightly F32 run on the large dataset with neither operand transposed and the matching nightly block sizes: compare the CL output with the reference
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                     framework::dataset::make("TransposeA", { false })),
+                                                                                                                     framework::dataset::make("TransposeB", { true })),
+                                                                                                                     m0_values_nightly_lhs_nt),
+                                                                                                                     n0_values_nightly_rhs_t),
+                                                                                                                     k0_values_nightly_rhs_t),
+                                                                                                                     framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                     framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output: the CL target (Rhs transposed only) must match the reference within the F32 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                     framework::dataset::make("TransposeA", { true })),
+                                                                                                                     framework::dataset::make("TransposeB", { false })),
+                                                                                                                     m0_values_nightly_lhs_t),
+                                                                                                                     n0_values_nightly_rhs_nt),
+                                                                                                                     k0_values_nightly_lhs_t_rhs_nt),
+                                                                                                                     framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                     framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output: the CL target (Lhs transposed only) must match the reference within the F32 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposedRhsTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output: the CL target (Lhs and Rhs both transposed) must match the reference within the F32 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+// Running the high-dimensional test for FP32 only is enough, because it stresses the number of dimensions, not the data type or M0/N0/K0.
+// Every Lhs/Rhs T/NT combination is still tested because each one is a different CL kernel.
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(),
+                                                                                                                      framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                      framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                      framework::dataset::make("M0", { 2 })),
+                                                                                                                      framework::dataset::make("N0", { 2 })),
+                                                                                                                      framework::dataset::make("K0", { 2 })),
+                                                                                                                      framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                              framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output: the CL target must match the reference within the F32 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+TEST_SUITE_END() // Buffer
+
+TEST_SUITE(ExportRhsToCLImage)
+FIXTURE_DATA_TEST_CASE(RunSmallRhsNotTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDatasetRhsExportToCLImageRhsNT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 4, 8, 16 })),
+                                               framework::dataset::make("K0", { 2, 4 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsNotTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDatasetRhsExportToCLImageRhsNT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               framework::dataset::make("M0", { 2 })), // The choice of M0 does not matter much because it relates to the Lhs tensor
+                                                       framework::dataset::make("N0", { 4, 8, 16 })),
+                                               framework::dataset::make("K0", { 1, 2, 3, 4 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunSmallRhsTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDatasetRhsExportToCLImageRhsT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 2, 4 })),
+                                               framework::dataset::make("K0", { 4, 8, 16 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, CLMatMulKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDatasetRhsExportToCLImageRhsT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               framework::dataset::make("M0", { 2 })), // The choice of M0 does not matter much because it relates to the Lhs tensor
+                                                       framework::dataset::make("N0", { 1, 2, 3, 4 })),
+                                               framework::dataset::make("K0", { 4, 8, 16 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+TEST_SUITE_END() // ExportRhsToCLImage
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+TEST_SUITE(Buffer)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulKernelFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                   framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                   m0_values_precommit),
+                                                                                                                   n0_values_precommit),
+                                                                                                                   k0_values_precommit),
+                                                                                                           framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                   framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: the CL target must match the reference within the F16 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulKernelFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                  framework::dataset::make("TransposeA", { false })),
+                                                                                                                  framework::dataset::make("TransposeB", { false })),
+                                                                                                                  m0_values_nightly_lhs_nt),
+                                                                                                                  n0_values_nightly_rhs_nt),
+                                                                                                                  k0_values_nightly_lhs_nt_rhs_nt),
+                                                                                                                  framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                  framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: the CL target (Lhs and Rhs both non-transposed) must match the reference within the F16 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                    framework::dataset::make("TransposeA", { false })),
+                                                                                                                    framework::dataset::make("TransposeB", { true })),
+                                                                                                                    m0_values_nightly_lhs_nt),
+                                                                                                                    n0_values_nightly_rhs_t),
+                                                                                                                    k0_values_nightly_rhs_t),
+                                                                                                                    framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: the CL target (Rhs transposed only) must match the reference within the F16 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                    framework::dataset::make("TransposeA", { true })),
+                                                                                                                    framework::dataset::make("TransposeB", { false })),
+                                                                                                                    m0_values_nightly_lhs_t),
+                                                                                                                    n0_values_nightly_rhs_nt),
+                                                                                                                    k0_values_nightly_lhs_t_rhs_nt),
+                                                                                                                    framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: the CL target (Lhs transposed only) must match the reference within the F16 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposedRhsTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: the CL target (Lhs and Rhs both transposed) must match the reference within the F16 tolerances
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+TEST_SUITE_END() // Buffer
+
+TEST_SUITE(ExportRhsToCLImage)
+FIXTURE_DATA_TEST_CASE(RunSmallRhsNotTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDatasetRhsExportToCLImageRhsNT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 4, 8, 16 })),
+                                               framework::dataset::make("K0", { 2, 4 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsNotTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDatasetRhsExportToCLImageRhsNT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               framework::dataset::make("M0", { 2 })), // The choice of M0 does not matter much because it relates to the Lhs tensor
+                                                       framework::dataset::make("N0", { 4, 8, 16 })),
+                                               framework::dataset::make("K0", { 1, 2, 3, 4 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunSmallRhsTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDatasetRhsExportToCLImageRhsT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 2, 4 })),
+                                               framework::dataset::make("K0", { 4, 8, 16 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, CLMatMulKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDatasetRhsExportToCLImageRhsT(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               framework::dataset::make("M0", { 2 })), // The choice of M0 does not matter much because it relates to the Lhs tensor
+                                                       framework::dataset::make("N0", { 1, 2, 3, 4 })),
+                                               framework::dataset::make("K0", { 4, 8, 16 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { true })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output only if _device_supports_export_to_cl_image is set; otherwise this body checks nothing
+    if(_device_supports_export_to_cl_image)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+TEST_SUITE_END() // ExportRhsToCLImage
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MatMulKernel
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/MatMulLowpNativeKernel.cpp b/tests/validation/CL/MatMulLowpNativeKernel.cpp
new file mode 100644
index 0000000000..90eee4fb82
--- /dev/null
+++ b/tests/validation/CL/MatMulLowpNativeKernel.cpp
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MatMulKernelFixture.h"
+#include "tests/validation/reference/Permute.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+} // namespace
+template <typename T>
+using CLMatMulLowpNativeKernelFixture = MatMulKernelValidationFixture<T, ClMatMulLowpNativeKernel>; // validation fixture bound to the lowp native kernel
+
+template <typename T>
+using CLMatMulLowpKernelWithBiasFixture = MatMulKernelWithBiasValidation<T, ClMatMulLowpNativeKernel>; // same kernel; presumably the variant that also feeds a bias tensor - confirm in the fixture header
+
+/** M0 values to test (precommit) */
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 values to test (precommit) */
+const auto n0_values_precommit = framework::dataset::make("N0", { 2, 4 });
+
+/** K0 values to test (precommit) */
+const auto k0_values_precommit = framework::dataset::make("K0", { 2, 3 });
+
+/** M0 values to test (nightly); the _lhs_nt / _lhs_t suffixes presumably mean non-transposed / transposed Lhs - confirm at the use sites */
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+const auto m0_values_nightly_lhs_t  = framework::dataset::make("M0", { 1, 2, 3, 4, 8 });
+
+/** N0 values to test (nightly); the _rhs_nt / _rhs_t suffixes presumably mean non-transposed / transposed Rhs - confirm at the use sites */
+const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 });
+const auto n0_values_nightly_rhs_t  = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
+
+/** K0 values to test (nightly), one set per Lhs/Rhs transposition case named in the suffix */
+const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 });
+const auto k0_values_nightly_rhs_t         = framework::dataset::make("K0", { 1, 2, 3, 4, 8 });
+const auto k0_values_nightly_lhs_t_rhs_nt  = framework::dataset::make("K0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+
+TEST_SUITE(CL)
+TEST_SUITE(MatMulLowpNativeKernel)
+TEST_SUITE(Validate)
+
+TEST_CASE(SupportedKernelConfigurations, framework::DatasetMode::ALL) // Checks which block-size configurations ClMatMulLowpNativeKernel::validate() accepts
+{
+    using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>; // (kernel info, expected validate() outcome)
+
+    const std::vector<MatMulConfigurationPair> supported_block_sizes =
+    {
+        // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+        // Lhs not-transposed, Rhs-not-transposed
+        { MatMulKernelInfo(false, false, 0, 1, 1), false },  // M0 should be > 0
+        { MatMulKernelInfo(false, false, 3, 5, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 6, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 3, 17), false }, // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 3, 7), false },  // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 9, 1, 2), true },   // valid block sizes
+        { MatMulKernelInfo(false, false, 3, 16, 3), true },  // valid block sizes
+        { MatMulKernelInfo(false, false, 7, 3, 4), true },   // valid block sizes
+        { MatMulKernelInfo(false, false, 7, 3, 4, true), false }, // export to CLImage is unsupported for quantized types, so validate() must reject it
+    };
+
+    // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal
+    // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here,
+    // not the shapes themselves.
+    const TensorInfo lhs_info = TensorInfo(TensorShape(100U, 100U), 1, DataType::QASYMM8_SIGNED);
+    const TensorInfo rhs_info = TensorInfo(TensorShape(100U, 100U), 1, DataType::QASYMM8_SIGNED);
+
+    for(auto &pair : supported_block_sizes)
+    {
+        TensorInfo output_info; // left default-constructed and passed to validate() as the destination info
+        Status     status = ClMatMulLowpNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first); // nullptr: no bias in this test
+
+        ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL) // Checks which Lhs/Rhs/bias shape combinations ClMatMulLowpNativeKernel::validate() accepts
+{
+    // Shapes are listed for the non-transposed (Nt/Nt) case; the loops below permute Lhs and/or Rhs to cover the other adj_lhs/adj_rhs combinations
+    using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, TensorShape, bool>; // (lhs shape, rhs shape, bias shape, expected validate() outcome)
+    const std::vector<ShapeConfigurationTuple> shape_configurations =
+    {
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(3U), true },
+        { TensorShape(10U, 12U), TensorShape(3U, 10U), TensorShape(3U), true },
+        { TensorShape(8U, 4U), TensorShape(2U, 8U), TensorShape(2U), true },
+        { TensorShape(8U, 4U), TensorShape(2U, 5U), TensorShape(2U), false }, // Mismatch in the K dimension
+        { TensorShape(5U, 0U), TensorShape(2U, 5U), TensorShape(2U), false }, // Invalid dimension
+        { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), true },
+        { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // no batch broadcasting
+        { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // mismatch in batch dimension
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(1U), false },                                 // invalid broadcast of bias
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(3U, 3U), false },                             // 2d bias is invalid
+    };
+
+    for(auto &tuple : shape_configurations)
+    {
+        const bool expected = std::get<3>(tuple); // the same expectation is used for all four transposition combinations
+
+        for(bool adj_lhs :
+            {
+                false, true
+            })
+        {
+            for(bool adj_rhs :
+                {
+                    false, true
+                })
+            {
+                TensorShape lhs_shape = std::get<0>(tuple); // fresh copies each iteration, so a permute below never leaks into the next combination
+                TensorShape rhs_shape = std::get<1>(tuple);
+                TensorShape bia_shape = std::get<2>(tuple);
+
+                if(adj_lhs)
+                {
+                    permute(lhs_shape, PermutationVector(1U, 0U)); // swap the first two dimensions of the Lhs shape
+                }
+
+                if(adj_rhs)
+                {
+                    permute(rhs_shape, PermutationVector(1U, 0U)); // swap the first two dimensions of the Rhs shape
+                }
+
+                const TensorInfo lhs_info = TensorInfo(lhs_shape, 1, DataType::QASYMM8_SIGNED);
+                const TensorInfo rhs_info = TensorInfo(rhs_shape, 1, DataType::QASYMM8_SIGNED);
+                const TensorInfo bia_info = TensorInfo(bia_shape, 1, DataType::S32);
+                TensorInfo       output_info; // left default-constructed and passed to validate() as the destination info
+
+                MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 1, false /* export_rhs_to_cl_image */ }; // M0 = N0 = K0 = 1
+
+                Status status = ClMatMulLowpNativeKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+                ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+            }
+        }
+    }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    using DataTypeConfigurationTuple = std::tuple<DataType, DataType, DataType, DataType, bool>; // (lhs, rhs, bias, output, expected validate() result)
+    const std::vector<DataTypeConfigurationTuple> data_type_configurations =
+    {
+        { DataType::F32, DataType::F32, DataType::F32, DataType::F32, false }, // no floating point types
+        { DataType::F16, DataType::F16, DataType::F16, DataType::F16, false }, // no floating point types
+        { DataType::F64, DataType::F64, DataType::F64, DataType::F64, false }, // no double precision
+        { DataType::QASYMM8, DataType::QASYMM8, DataType::S32, DataType::QASYMM8, true }, // accepted: qasymm8 tensors with S32 bias
+        { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8_SIGNED, true }, // accepted: qasymm8_signed tensors with S32 bias
+        { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::S32, DataType::QSYMM8_PER_CHANNEL, false }, // only qasymm8/qasymm8_signed is supported
+        { DataType::QASYMM16, DataType::QASYMM16, DataType::S32, DataType::QASYMM16, false },                               // only qasymm8/qasymm8_signed is supported
+        { DataType::QSYMM16, DataType::QSYMM16, DataType::S32, DataType::QSYMM16, false },                                  // only qasymm8/qasymm8_signed is supported
+        { DataType::QSYMM8, DataType::QSYMM8, DataType::S32, DataType::QSYMM8, false },                                     // only qasymm8/qasymm8_signed is supported
+        { DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8, false },                           // no mixed data types
+        { DataType::S64, DataType::S64, DataType::S64, DataType::S64, false },                                              // no integral types
+        { DataType::S32, DataType::S32, DataType::S32, DataType::S32, false },                                              // no integral types
+        { DataType::S16, DataType::S16, DataType::S16, DataType::S16, false },                                              // no integral types
+        { DataType::S8, DataType::S8, DataType::S8, DataType::S8, false },                                                  // no integral types
+        { DataType::U64, DataType::U64, DataType::U64, DataType::U64, false },                                              // no integral types
+        { DataType::U32, DataType::U32, DataType::U32, DataType::U32, false },                                              // no integral types
+        { DataType::U16, DataType::U16, DataType::U16, DataType::U16, false },                                              // no integral types
+        { DataType::U8, DataType::U8, DataType::U8, DataType::U8, false },                                                  // no integral types
+        { DataType::QASYMM8, DataType::QASYMM8, DataType::F32, DataType::QASYMM8, false }                                   // Only S32 bias is supported
+    };
+
+    // A single shape and block size configuration is enough when only the data types are under test
+    const TensorShape      shape     = TensorShape(10U, 10U);
+    const TensorShape      bia_shape = TensorShape(10U);
+    const MatMulKernelInfo matmul_kernel_info{ false, false, 1, 1, 1, false };
+    for(auto &tuple : data_type_configurations)
+    {
+        const bool expected = std::get<4>(tuple); // last tuple element: whether validate() is expected to succeed
+
+        const TensorInfo lhs_info(shape, 1, std::get<0>(tuple));
+        const TensorInfo rhs_info(shape, 1, std::get<1>(tuple));
+        const TensorInfo bia_info(bia_shape, 1, std::get<2>(tuple));
+        TensorInfo       output_info(shape, 1, std::get<3>(tuple));
+
+        Status status = ClMatMulLowpNativeKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); // the returned status must match the expectation for this row
+    }
+}
+
+TEST_SUITE_END() // Validate
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(),
+                                                                                                                      framework::dataset::make("TransposeA", { true, false })),
+                                                                                                                      framework::dataset::make("TransposeB", { true, false })),
+                                                                                                                      m0_values_precommit),
+                                                                                                                      n0_values_precommit),
+                                                                                                                      k0_values_precommit),
+                                                                                                                      framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Compare the fixture's CL target tensor against its reference result, using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                       framework::dataset::make("TransposeA", { true, false })),
+                                                                                                                       framework::dataset::make("TransposeB", { true, false })),
+                                                                                                                       m0_values_precommit),
+                                                                                                                       n0_values_precommit),
+                                                                                                                       k0_values_precommit),
+                                                                                                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Compare the fixture's CL target tensor against its reference result, using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpKernelWithBiasFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                    framework::dataset::make("TransposeA", { true, false })),
+                                                                                                                    framework::dataset::make("TransposeB", { true, false })),
+                                                                                                                    m0_values_precommit),
+                                                                                                                    n0_values_precommit),
+                                                                                                                    k0_values_precommit),
+                                                                                                                    framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Same comparison as the bias-less runs, but through the with-bias fixture: target vs. reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_values_nightly_lhs_nt_rhs_nt),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Nightly run on the large dataset, Lhs and Rhs both non-transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Nightly run on the large dataset, only Rhs transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_values_nightly_lhs_t_rhs_nt),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Nightly run on the large dataset, only Lhs transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposedRhsTransposed, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Nightly run on the large dataset, Lhs and Rhs both transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+// Running the High Dimensional test only for qasymm8_signed is enough, because it stresses the number of dimensions, not the data type or M0/N0/K0
+// Every Lhs/Rhs T/NT combination is still tested because each one is a different CL kernel
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulLowpNativeKernelFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true, false })),
+                                                                       framework::dataset::make("TransposeB", { true, false })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 2 })),
+                                               framework::dataset::make("K0", { 2 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    // Compare the fixture's CL target tensor against its reference result, using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLMatMulLowpNativeKernelFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(),
+                                                                                                                       framework::dataset::make("TransposeA", { true, false })),
+                                                                                                                       framework::dataset::make("TransposeB", { true, false })),
+                                                                                                                       m0_values_precommit),
+                                                                                                                       n0_values_precommit),
+                                                                                                                       k0_values_precommit),
+                                                                                                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Compare the fixture's CL target tensor against its reference result, using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeKernelFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                        framework::dataset::make("TransposeA", { true, false })),
+                                                                                                                        framework::dataset::make("TransposeB", { true, false })),
+                                                                                                                        m0_values_precommit),
+                                                                                                                        n0_values_precommit),
+                                                                                                                        k0_values_precommit),
+                                                                                                                        framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                                framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Compare the fixture's CL target tensor against its reference result, using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulLowpNativeKernelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_values_nightly_lhs_nt_rhs_nt),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Nightly run on the large dataset, Lhs and Rhs both non-transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, CLMatMulLowpNativeKernelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Nightly run on the large dataset, only Rhs transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulLowpNativeKernelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_values_nightly_lhs_t_rhs_nt),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Nightly run on the large dataset, only Lhs transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposedRhsTransposed, CLMatMulLowpNativeKernelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Nightly run on the large dataset, Lhs and Rhs both transposed: compare target against reference using tolerance_quant
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // MatMulLowpNativeKernel
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp b/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp
new file mode 100644
index 0000000000..ac46b67c9e
--- /dev/null
+++ b/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
+
+#include "tests/datasets/MatMulLowpMMULDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MatMulKernelFixture.h"
+#include "tests/validation/reference/Permute.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Absolute tolerance (1) used when comparing the reference's output against the implementation's output for quantized data types */
+} // namespace
+using framework::dataset::make; // lets the fixture test cases in this file write make(...) unqualified
+
+template <typename T>
+using CLMatMulLowpNativeMMULKernelFixture = MatMulKernelValidationFixture<T, ClMatMulLowpNativeMMULKernel, true /* use_mmul */>; // bias-less fixture bound to the MMUL kernel
+
+template <typename T>
+using CLMatMulLowpNativeMMULKernelWithBiasFixture = MatMulKernelWithBiasValidation<T, ClMatMulLowpNativeMMULKernel, true /* use_mmul */>; // with-bias fixture bound to the MMUL kernel
+
+/** M0 values to test (precommit) */
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 values to test (precommit) */
+const auto n0_values_precommit = framework::dataset::make("N0", { 2, 4 });
+
+/** M0 values to test (nightly); the _lhs_nt / _lhs_t suffixes presumably select non-transposed / transposed Lhs runs - confirm against the nightly test cases */
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 2, 4, 5, 8 });
+const auto m0_values_nightly_lhs_t  = framework::dataset::make("M0", { 2, 4, 8 });
+
+/** N0 values to test (nightly) */
+const auto n0_values_nightly = framework::dataset::make("N0", { 1, 3, 8, 16 });
+
+TEST_SUITE(CL)
+TEST_SUITE(MatMulLowpNativeMMULKernel)
+TEST_SUITE(Validate)
+
+TEST_CASE(SupportedKernelConfigurations, framework::DatasetMode::ALL)
+{
+    using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>; // (kernel info, expected validate() result on a device with MMUL support)
+
+    const std::vector<MatMulConfigurationPair> supported_block_sizes =
+    {
+        // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+        { MatMulKernelInfo(false, false, 0, 1, 4), false }, // M0 should be > 0
+        { MatMulKernelInfo(false, true, 3, 5, 4), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 6, 4), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, false, 3, 3, 8), false }, // K0 must be 4
+        { MatMulKernelInfo(true, false, 5, 3, 4), false },  // M0 not in {1, 2, 3, 4, 8, 16} when Lhs is transposed
+        { MatMulKernelInfo(false, false, 9, 1, 4), true },
+        { MatMulKernelInfo(false, true, 3, 16, 4), true },
+        { MatMulKernelInfo(false, false, 7, 3, 4), true },
+        { MatMulKernelInfo(true, false, 8, 3, 4), true },
+        { MatMulKernelInfo(true, true, 4, 3, 4), true },
+        { MatMulKernelInfo(false, false, 7, 3, 4, true), false }, // export to CLImage is unsupported for quantized types
+    };
+
+    // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal
+    // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here,
+    // not the shapes themselves.
+    const TensorInfo lhs_info = TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8_SIGNED);
+    const TensorInfo rhs_info = TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8_SIGNED);
+
+    for(auto &pair : supported_block_sizes)
+    {
+        TensorInfo output_info; // left default-constructed for validate()
+        Status     status   = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first); // nullptr: no bias tensor in this test
+        const bool expected = (pair.second && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device())); // without arm_matrix_multiply support every configuration is expected to fail
+
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+    // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations
+    using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, TensorShape, bool>; // (lhs shape, rhs shape, bias shape, expected validate() result)
+    const std::vector<ShapeConfigurationTuple> shape_configurations =
+    {
+        { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(3U), true },
+        { TensorShape(16U, 12U), TensorShape(3U, 16U), TensorShape(3U), true },
+        { TensorShape(64U, 4U), TensorShape(2U, 64U), TensorShape(2U), true },
+        { TensorShape(16U, 4U), TensorShape(2U, 32U), TensorShape(2U), false }, // Mismatch in the K dimension
+        { TensorShape(16U, 0U), TensorShape(2U, 16U), TensorShape(2U), false }, // Invalid dimension
+        { TensorShape(32U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), true },
+        { TensorShape(32U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // no batch broadcasting
+        { TensorShape(32U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // mismatch in batch dimension
+        { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(1U), false },                                 // invalid broadcast of bias
+        { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(3U, 3U), false },                             // 2d bias is invalid
+        { TensorShape(12U, 12U), TensorShape(3U, 12U), TensorShape(3U), false },                                // K must be a multiple of 16
+    };
+
+    for(auto &tuple : shape_configurations)
+    {
+        const bool expected = (std::get<3>(tuple) && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device())); // without arm_matrix_multiply support every configuration is expected to fail
+
+        for(bool adj_lhs :
+            {
+                false, true
+            })
+        {
+            for(bool adj_rhs :
+                {
+                    false, true
+                })
+            {
+                TensorShape lhs_shape = std::get<0>(tuple); // fresh copies each iteration, since permute() below modifies them in place
+                TensorShape rhs_shape = std::get<1>(tuple);
+                TensorShape bia_shape = std::get<2>(tuple);
+
+                if(adj_lhs)
+                {
+                    permute(lhs_shape, PermutationVector(1U, 0U)); // swap the first two dimensions so the shape matches a transposed Lhs
+                }
+
+                if(adj_rhs)
+                {
+                    permute(rhs_shape, PermutationVector(1U, 0U)); // swap the first two dimensions so the shape matches a transposed Rhs
+                }
+
+                const TensorInfo lhs_info = TensorInfo(lhs_shape, 1, DataType::QASYMM8_SIGNED);
+                const TensorInfo rhs_info = TensorInfo(rhs_shape, 1, DataType::QASYMM8_SIGNED);
+                const TensorInfo bia_info = TensorInfo(bia_shape, 1, DataType::S32);
+                TensorInfo       output_info;
+
+                MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 4, false /* export_rhs_to_cl_image */ }; // M0 = N0 = 1, K0 = 4: only the shapes vary in this test
+
+                Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+                ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+            }
+        }
+    }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    using DataTypeConfigurationTuple = std::tuple<DataType, DataType, DataType, DataType, bool>; // (lhs, rhs, bias, output, expected validate() result)
+    const std::vector<DataTypeConfigurationTuple> data_type_configurations =
+    {
+        { DataType::F32, DataType::F32, DataType::F32, DataType::F32, false }, // no floating point types
+        { DataType::F16, DataType::F16, DataType::F16, DataType::F16, false }, // no floating point types
+        { DataType::F64, DataType::F64, DataType::F64, DataType::F64, false }, // no double precision
+        { DataType::QASYMM8, DataType::QASYMM8, DataType::S32, DataType::QASYMM8, true }, // accepted: qasymm8 tensors with S32 bias
+        { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8_SIGNED, true }, // accepted: qasymm8_signed tensors with S32 bias
+        { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::S32, DataType::QSYMM8_PER_CHANNEL, false }, // only qasymm8/qasymm8_signed is supported
+        { DataType::QASYMM16, DataType::QASYMM16, DataType::S32, DataType::QASYMM16, false },                               // only qasymm8/qasymm8_signed is supported
+        { DataType::QSYMM16, DataType::QSYMM16, DataType::S32, DataType::QSYMM16, false },                                  // only qasymm8/qasymm8_signed is supported
+        { DataType::QSYMM8, DataType::QSYMM8, DataType::S32, DataType::QSYMM8, false },                                     // only qasymm8/qasymm8_signed is supported
+        { DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8, false },                           // no mixed data types
+        { DataType::S64, DataType::S64, DataType::S64, DataType::S64, false },                                              // no integral types
+        { DataType::S32, DataType::S32, DataType::S32, DataType::S32, false },                                              // no integral types
+        { DataType::S16, DataType::S16, DataType::S16, DataType::S16, false },                                              // no integral types
+        { DataType::S8, DataType::S8, DataType::S8, DataType::S8, false },                                                  // no integral types
+        { DataType::U64, DataType::U64, DataType::U64, DataType::U64, false },                                              // no integral types
+        { DataType::U32, DataType::U32, DataType::U32, DataType::U32, false },                                              // no integral types
+        { DataType::U16, DataType::U16, DataType::U16, DataType::U16, false },                                              // no integral types
+        { DataType::U8, DataType::U8, DataType::U8, DataType::U8, false },                                                  // no integral types
+        { DataType::QASYMM8, DataType::QASYMM8, DataType::F32, DataType::QASYMM8, false }                                   // Only S32 bias is supported
+    };
+
+    // A single shape and block size configuration is enough when only the data types are under test
+    const TensorShape      shape     = TensorShape(48U, 48U);
+    const TensorShape      bia_shape = TensorShape(48U);
+    const MatMulKernelInfo matmul_kernel_info{ false, false, 1, 1, 4, false };
+    for(auto &tuple : data_type_configurations)
+    {
+        const bool expected = (std::get<4>(tuple) && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device())); // without arm_matrix_multiply support every configuration is expected to fail
+
+        const TensorInfo lhs_info(shape, 1, std::get<0>(tuple));
+        const TensorInfo rhs_info(shape, 1, std::get<1>(tuple));
+        const TensorInfo bia_info(bia_shape, 1, std::get<2>(tuple));
+        TensorInfo       output_info(shape, 1, std::get<3>(tuple));
+
+        Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); // the returned status must match the expectation for this row
+    }
+}
+
+TEST_SUITE_END() // Validate
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULDataset(),
+                               make("TransposeA", { false, true }),
+                               make("TransposeB", { false, true }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpNativeMMULKernelWithBiasFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULWithBiasDataset(),
+                               make("TransposeA", { false, true }),
+                               make("TransposeB", { false, true }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the biased CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeLhsNotTransposed, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulLowpMMULDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false, true }),
+                               m0_values_nightly_lhs_nt,
+                               n0_values_nightly,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulLowpMMULDataset(),
+                               make("TransposeA", { true }),
+                               make("TransposeB", { false, true }),
+                               m0_values_nightly_lhs_t,
+                               n0_values_nightly,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+// Running High Dimensional test is enough for qasymm8_signed, because we're stressing the number of dimensions, not data type or M0/N0/K0
+// It's a good idea to test for each Lhs/Rhs T/NT combinations because they're different CL kernels
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::HighDimensionalMatMulLowpMMULDataset(),
+                               make("TransposeA", { false, true }),
+                               make("TransposeB", { false, true }),
+                               make("M0", { 2 }),
+                               make("N0", { 2 }),
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeMMULKernelFixture<uint8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULDatasetSubset(),
+                               make("TransposeA", { false, true }),
+                               make("TransposeB", { false, true }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpNativeMMULKernelWithBiasFixture<uint8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULWithBiasDataset(),
+                               make("TransposeA", { false, true }),
+                               make("TransposeB", { false, true }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the biased CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeLhsNotTransposed, CLMatMulLowpNativeMMULKernelFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulLowpMMULDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false, true }),
+                               m0_values_nightly_lhs_nt,
+                               n0_values_nightly,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulLowpNativeMMULKernelFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulLowpMMULDataset(),
+                               make("TransposeA", { true }),
+                               make("TransposeB", { false, true }),
+                               m0_values_nightly_lhs_t,
+                               n0_values_nightly,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate the CL output against the reference using tolerance_quant; skipped unless the fixture reports MMUL support
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // MatMulLowpNativeMMULKernel
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/MatMulNativeMMULKernel.cpp b/tests/validation/CL/MatMulNativeMMULKernel.cpp
new file mode 100644
index 0000000000..655dd354dc
--- /dev/null
+++ b/tests/validation/CL/MatMulNativeMMULKernel.cpp
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
+#include "tests/datasets/LargeMatMulMMULDataset.h"
+#include "tests/datasets/SmallMatMulMMULDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MatMulKernelFixture.h"
+#include "tests/validation/reference/Permute.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Relative tolerance value for comparing reference's output against implementation's output for fp32 data types */
+constexpr float          abs_tolerance_f32(
+    0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data types in case using relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(
+    0.02f);                                                   /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data types in case using relative tolerance fails because of small values */
+RelativeTolerance<half_float::half> tolerance_f16(half(0.02)); /**< Relative tolerance value for comparing reference's output against implementation's output for fp16 data types */
+} // namespace
+
+/** M0 values to test --precommit */
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 values to test --precommit */
+const auto n0_values_precommit = framework::dataset::make("N0", { 2, 4 });
+
+/** M0 values to test --nightly (lhs_nt: used with TransposeA = false, lhs_t: used with TransposeA = true) */
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+const auto m0_values_nightly_lhs_t  = framework::dataset::make("M0", { 1, 2, 3, 4, 8 });
+
+/** N0 values to test --nightly (rhs_nt: used with TransposeB = false, rhs_t: used with TransposeB = true) */
+const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 });
+const auto n0_values_nightly_rhs_t  = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
+
+/** K0 value -- Fixed to 1 (the SupportedBlockSizes test expects every other K0 to be rejected) */
+const auto k0_value = framework::dataset::make("K0", { 1 });
+
+template <typename T>
+using CLMatMulNativeMMULKernelFixture = MatMulKernelValidationFixture<T, ClMatMulNativeMMULKernel, true /*use_mmul*/>;
+
+template <typename T>
+using CLMatMulKernelBiasFixture = MatMulKernelWithBiasValidation<T, ClMatMulNativeMMULKernel, true /*use_mmul*/>;
+
+TEST_SUITE(CL)
+TEST_SUITE(MatMulNativeMMULKernel)
+TEST_SUITE(Validate)
+
+TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
+{
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>; // kernel info, expected validate() result
+
+        const std::vector<MatMulConfigurationPair> supported_block_sizes =
+        {
+            // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+            // Lhs not-transposed, Rhs not-transposed
+            { MatMulKernelInfo(false, false, 0, 1, 1), false }, // M0 should be > 0
+            { MatMulKernelInfo(false, false, 3, 5, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(false, false, 3, 6, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(false, false, 3, 3, 4), false }, // K0 not 1
+            { MatMulKernelInfo(false, false, 9, 1, 1), true },
+            { MatMulKernelInfo(false, false, 3, 16, 1), true },
+            { MatMulKernelInfo(false, false, 7, 3, 1), true },
+
+            // Lhs transposed, Rhs not-transposed
+            { MatMulKernelInfo(true, false, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, false, 3, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, false, 6, 3, 1), false },  // M0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, false, 5, 3, 1), false },  // M0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, false, 2, 2, 2), false },  // K0 is not 1
+            { MatMulKernelInfo(true, false, 4, 1, 1), true },
+            { MatMulKernelInfo(true, false, 3, 3, 1), true },
+            { MatMulKernelInfo(true, false, 2, 4, 1), true },
+
+            // Lhs not-transposed, Rhs transposed
+            { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(false, true, 2, 17, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(false, true, 4, 5, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(false, true, 4, 4, 7), false },  // K0 is not 1
+            { MatMulKernelInfo(false, true, 4, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(false, true, 3, 8, 1), true },
+            { MatMulKernelInfo(false, true, 8, 16, 1), true },
+            { MatMulKernelInfo(false, true, 2, 4, 1), true },
+
+            // Lhs transposed, Rhs transposed
+            { MatMulKernelInfo(true, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, true, 3, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, true, 6, 3, 1), false },  // M0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, true, 5, 3, 1), false },  // M0 not in {1, 2, 3, 4, 8, 16}
+            { MatMulKernelInfo(true, true, 4, 8, 2), false },  // K0 is not 1
+            { MatMulKernelInfo(true, true, 4, 8, 1), true },
+            { MatMulKernelInfo(true, true, 3, 3, 1), true },
+            { MatMulKernelInfo(true, true, 16, 4, 1), true },
+        };
+
+        // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal
+        // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here,
+        // not the shapes themselves.
+        const TensorInfo lhs_info = TensorInfo(TensorShape(100U, 100U), 1, DataType::F32);
+        const TensorInfo rhs_info = TensorInfo(TensorShape(100U, 100U), 1, DataType::F32);
+
+        for(auto &pair : supported_block_sizes)
+        {
+            TensorInfo output_info;
+            Status     status = ClMatMulNativeMMULKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first);
+            ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        // Configurations are written as Nt/Nt; the loops below permute the lhs/rhs shapes to cover every adj_lhs/adj_rhs combination
+        using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, TensorShape, bool>; // lhs shape, rhs shape, bias shape, expected validate() result
+        const std::vector<ShapeConfigurationTuple> shape_configurations =
+        {
+            { TensorShape(4U, 1U), TensorShape(3U, 4U), TensorShape(3U), true },
+            { TensorShape(12U, 12U), TensorShape(3U, 12U), TensorShape(3U), true },
+            { TensorShape(8U, 4U), TensorShape(2U, 8U), TensorShape(2U), true },
+            { TensorShape(8U, 4U), TensorShape(2U, 4U), TensorShape(2U), false }, // Mismatch in the K dimension
+            { TensorShape(5U, 0U), TensorShape(2U, 5U), TensorShape(2U), false }, // Invalid dimension
+            { TensorShape(5U, 7U), TensorShape(2U, 5U), TensorShape(2U), false }, // K not a multiple of 4 (MMUL_K0)
+            { TensorShape(8U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 8U, 3U, 4U, 5U, 6U), TensorShape(2U), true },
+            { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // No batch broadcasting
+            { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // Mismatch in batch dimension
+            { TensorShape(4U, 1U), TensorShape(3U, 4U), TensorShape(1U), false },                                 // Bias first dimension != dst first dimension.
+            { TensorShape(4U, 1U), TensorShape(3U, 4U), TensorShape(5U, 6U), false },                             // Bias is 2d which is invalid.
+        };
+
+        for(auto &tuple : shape_configurations)
+        {
+            const bool expected = std::get<3>(tuple);
+
+            for(bool adj_lhs :
+                {
+                    false, true
+                })
+            {
+                for(bool adj_rhs :
+                    {
+                        false, true
+                    })
+                {
+                    TensorShape lhs_shape = std::get<0>(tuple);
+                    TensorShape rhs_shape = std::get<1>(tuple);
+                    TensorShape bia_shape = std::get<2>(tuple);
+
+                    if(adj_lhs)
+                    {
+                        permute(lhs_shape, PermutationVector(1U, 0U));
+                    }
+
+                    if(adj_rhs)
+                    {
+                        permute(rhs_shape, PermutationVector(1U, 0U));
+                    }
+
+                    const TensorInfo lhs_info = TensorInfo(lhs_shape, 1, DataType::F32);
+                    const TensorInfo rhs_info = TensorInfo(rhs_shape, 1, DataType::F32);
+                    const TensorInfo bia_info = TensorInfo(bia_shape, 1, DataType::F32);
+                    TensorInfo       output_info;
+
+                    MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 1, false /* export_rhs_to_cl_image */ };
+
+                    Status status = ClMatMulNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+                    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+                }
+            }
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        // Each tuple holds the lhs, rhs, bias and dst data types followed by the expected validate() result; only the Nt/Nt configuration is checked here
+        using DataTypeConfigurationTuple = std::tuple<DataType, DataType, DataType, DataType, bool>;
+        const std::vector<DataTypeConfigurationTuple> data_type_configurations =
+        {
+            { DataType::F32, DataType::F32, DataType::F32, DataType::F32, true },
+            { DataType::F16, DataType::F16, DataType::F16, DataType::F16, true },
+            { DataType::F32, DataType::F32, DataType::F32, DataType::F32, true }, // NOTE(review): repeats the first entry
+            { DataType::F32, DataType::F32, DataType::F16, DataType::F32, false },                                              // incorrect bias type
+            { DataType::F16, DataType::F32, DataType::F32, DataType::F32, false },                                              // no mixed precision
+            { DataType::F64, DataType::F64, DataType::F64, DataType::F64, false },                                              // no double precision
+            { DataType::QASYMM8, DataType::QASYMM8, DataType::S32, DataType::QASYMM8, false },                                  // no quantized types
+            { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::S32, DataType::QASYMM8_SIGNED, false },             // no quantized types
+            { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::S32, DataType::QSYMM8_PER_CHANNEL, false }, // no quantized types
+            { DataType::QASYMM16, DataType::QASYMM16, DataType::S32, DataType::QASYMM16, false },                               // no quantized types
+            { DataType::QSYMM16, DataType::QSYMM16, DataType::S32, DataType::QSYMM16, false },                                  // no quantized types
+            { DataType::QSYMM8, DataType::QSYMM8, DataType::S32, DataType::QSYMM8, false },                                     // no quantized types
+            { DataType::S64, DataType::S64, DataType::S64, DataType::S64, false },                                              // no integral types
+            { DataType::S32, DataType::S32, DataType::S32, DataType::S32, false },                                              // no integral types
+            { DataType::S16, DataType::S16, DataType::S16, DataType::S16, false },                                              // no integral types
+            { DataType::S8, DataType::S8, DataType::S8, DataType::S8, false },                                                  // no integral types
+            { DataType::U64, DataType::U64, DataType::U64, DataType::U64, false },                                              // no integral types
+            { DataType::U32, DataType::U32, DataType::U32, DataType::U32, false },                                              // no integral types
+            { DataType::U16, DataType::U16, DataType::U16, DataType::U16, false },                                              // no integral types
+            { DataType::U8, DataType::U8, DataType::U8, DataType::U8, false },                                                  // no integral types
+        };
+
+        const TensorShape      shape     = TensorShape(8U, 8U);
+        const TensorShape      bia_shape = TensorShape(8U);
+        const MatMulKernelInfo matmul_kernel_info{ false, false, 1, 1, 1, false };
+        for(auto &tuple : data_type_configurations)
+        {
+            const bool expected = std::get<4>(tuple);
+
+            const TensorInfo lhs_info(shape, 1, std::get<0>(tuple));
+            const TensorInfo rhs_info(shape, 1, std::get<1>(tuple));
+            const TensorInfo bia_info(bia_shape, 1, std::get<2>(tuple));
+            TensorInfo       output_info(shape, 1, std::get<3>(tuple));
+
+            Status status = ClMatMulNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+            ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+TEST_SUITE_END() // Validate
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+TEST_SUITE(Buffer)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulMMULDataset(),
+                                                                                                                     framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                     framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                     m0_values_precommit),
+                                                                                                                     n0_values_precommit),
+                                                                                                                     k0_value),
+                                                                                                                     framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                             framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference (tolerance_f32, abs_tolerance_f32); skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulMMULDataset(),
+                                                                                                                      framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                      framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                      m0_values_precommit),
+                                                                                                                      n0_values_precommit),
+                                                                                                                      k0_value),
+                                                                                                                      framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                              framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference (tolerance_f32, abs_tolerance_f32); skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulKernelBiasFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulMMULDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                   framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                   m0_values_precommit),
+                                                                                                                   n0_values_precommit),
+                                                                                                                   k0_value),
+                                                                                                                   framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                           framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate biased output against the reference (tolerance_f32, abs_tolerance_f32); skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference (tolerance_f32, abs_tolerance_f32); skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTranspose, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference (tolerance_f32, abs_tolerance_f32); skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference (tolerance_f32, abs_tolerance_f32);
+    // skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposedRhsTransposed, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference (tolerance_f32, abs_tolerance_f32);
+    // skipped unless the fixture reports MMUL support
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+// Running the High Dimensional test for FP32 only is enough, because it stresses the number of dimensions, not the data type or M0/N0/K0.
+// Each Lhs/Rhs T/NT combination is still tested because they are different CL kernels.
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulNativeMMULKernelFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { false, true })),
+                                                                       framework::dataset::make("TransposeB", { false, true })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 2 })),
+                                               framework::dataset::make("K0", { 1 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference; skipped when _device_supports_mmul is false
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+}
+TEST_SUITE_END() // Buffer
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+TEST_SUITE(Buffer)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulMMULDataset(),
+                                                                                                                     framework::dataset::make("TransposeA", { false, true })),
+                                                                                                                     framework::dataset::make("TransposeB", { false, true })),
+                                                                                                                     m0_values_precommit),
+                                                                                                                     n0_values_precommit),
+                                                                                                                     k0_value),
+                                                                                                                     framework::dataset::make("ExportRhsToCLImage", { false })),
+                                                                                                             framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output against the reference; skipped when _device_supports_mmul is false
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output against the reference; skipped when _device_supports_mmul is false
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTranspose, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output against the reference; skipped when _device_supports_mmul is false
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposed, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { false })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_nt),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output against the reference using tolerance_f16 / abs_tolerance_f16.
+    // The check is skipped when _device_supports_mmul is false.
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLargeLhsTransposedRhsTransposed, CLMatMulNativeMMULKernelFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulMMULDataset(),
+                                                                               framework::dataset::make("TransposeA", { true })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_t),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_value),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output against the reference using tolerance_f16 / abs_tolerance_f16.
+    // The check is skipped when _device_supports_mmul is false.
+    if(_device_supports_mmul)
+    {
+        validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    }
+}
+TEST_SUITE_END() // Buffer
+
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MatMulNativeMMULKernel
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/MaxUnpoolingLayer.cpp b/tests/validation/CL/MaxUnpoolingLayer.cpp
index 6cba8b8bd5..cf4fcdda70 100644
--- a/tests/validation/CL/MaxUnpoolingLayer.cpp
+++ b/tests/validation/CL/MaxUnpoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,20 +51,19 @@ const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(framework::datase
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(MaxUnpooling, CLMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, CLMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
                                                                                                                    framework::dataset::make("DataType", DataType::F32))),
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
 
                                                                                                                   ))
 {
-    printf("validate\n");
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(MaxUnpooling, CLMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, CLMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType", DataType::F16))),
                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
 
diff --git a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
index e77a21ed7f..cdeb622130 100644
--- a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,7 +78,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLMeanStdDevNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-3 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/CL/NegLayer.cpp b/tests/validation/CL/NegLayer.cpp
index 690b8f44fb..c93e31dca9 100644
--- a/tests/validation/CL/NegLayer.cpp
+++ b/tests/validation/CL/NegLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -50,6 +50,15 @@ TEST_SUITE(NegLayer)
 template <typename T>
 using CLNegLayerFixture = NegValidationFixture<CLTensor, CLAccessor, CLNegLayer, T>;
 
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLNegLayerFixture<int>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                    DataType::S32)))
+{
+    // Validate output against the reference; no tolerance is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLNegLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
diff --git a/tests/validation/CL/NormalizationLayer.cpp b/tests/validation/CL/NormalizationLayer.cpp
index 1aed2786ff..8c1890842b 100644
--- a/tests/validation/CL/NormalizationLayer.cpp
+++ b/tests/validation/CL/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,33 +62,28 @@ const auto NormalizationDatasetFP32 = combine(combine(combine(framework::dataset
 TEST_SUITE(CL)
 TEST_SUITE(NormalizationLayer)
 
-//TODO(COMPMID-415): Missing configuration?
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/output
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Even normalization
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non implemented IN_MAP_2D
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Windows shrinking for NCHW
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                      }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(27U, 11U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("NormInfo",  { NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                                        NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                                        NormalizationLayerInfo(NormType::IN_MAP_1D, 4),
                                                        NormalizationLayerInfo(NormType::IN_MAP_2D, 5),
-                                                       NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                                        NormalizationLayerInfo(NormType::CROSS_MAP, 5),
                                                       })),
-               framework::dataset::make("Expected", { false, false, false, false, false, true })),
+               framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, norm_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLNormalizationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), norm_info)) == expected, framework::LogLevel::ERRORS);
diff --git a/tests/validation/CL/PReluLayer.cpp b/tests/validation/CL/PReluLayer.cpp
index 82f3e4f806..f3f1c8b1b8 100644
--- a/tests/validation/CL/PReluLayer.cpp
+++ b/tests/validation/CL/PReluLayer.cpp
@@ -56,7 +56,7 @@ const auto PReluLayerQASYMM8Dataset = combine(combine(framework::dataset::make("
 const auto PReluLayerQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                     framework::dataset::make("DataType",
                                                                              DataType::QASYMM8_SIGNED));
-const auto PReluLayerS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
+const auto PReluLayerS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
                                           framework::dataset::make("DataType", DataType::S16));
 const auto PReluLayerFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
                                            framework::dataset::make("DataType", DataType::F16));
@@ -71,21 +71,18 @@ TEST_SUITE(PReluLayer)
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
                framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false})),
+               framework::dataset::make("Expected", { true, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLPReluLayer::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -200,6 +197,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerFixture<int16_t>, framework::Datase
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunOneDimensional, CLPReluLayerFixture<int16_t>, framework::DatasetMode::ALL, combine(framework::dataset::make("Shape", TensorShape(1U, 16U)), PReluLayerS16Dataset))
+{
+    // Validate output against the reference; no tolerance is passed
+    validate(CLAccessor(_target), _reference);
+}
 TEST_SUITE_END()
 
 TEST_SUITE(Float)
diff --git a/tests/validation/CL/PadLayer.cpp b/tests/validation/CL/PadLayer.cpp
index 370195b078..ea0cb32785 100644
--- a/tests/validation/CL/PadLayer.cpp
+++ b/tests/validation/CL/PadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/graph/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+#include "src/graph/mutators/MutatorUtils.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -110,6 +112,63 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
     ARM_COMPUTE_EXPECT(bool(CLPadLayer::validate(&input_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), padding, PixelValue(), mode)) == expected, framework::LogLevel::ERRORS);
 }
 
+DATA_TEST_CASE(CheckFusingWithConvolution, framework::DatasetMode::ALL, zip(zip(
+                framework::dataset::make("DataLayout",  { DataLayout::NCHW,                               // nchw
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NCHW,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::NHWC,
+                                                          DataLayout::UNKNOWN                             // unknown
+                                                        }),
+                framework::dataset::make("PaddingList", { PaddingList({{0, 0}, {1, 1}, {1, 1}}),          // nchw
+                                                          PaddingList({{1, 1}, {1, 1}, {0, 0}, {0, 0}}),
+                                                          PaddingList({{1, 1}, {1, 1}}),
+                                                          PaddingList({}),
+                                                          PaddingList({{0, 0}}),
+                                                          PaddingList({{0, 0}, {0, 0}, {0, 0}, {0, 0}}),
+                                                          PaddingList({{0, 0}, {0, 0}, {0, 0}, {1, 0}}),
+                                                          PaddingList({{0, 1}}),
+                                                          PaddingList({{0, 0}, {1, 1}, {1, 1}}),          // nhwc
+                                                          PaddingList({{0, 0}, {0, 0}, {1, 1}, {1, 1}}),
+                                                          PaddingList({{0, 0}, {1, 0}, {1, 1}, {0, 0}}),
+                                                          PaddingList({}),
+                                                          PaddingList({{0, 0}}),
+                                                          PaddingList({{0, 1}}),
+                                                          PaddingList({{0, 0}, {1, 1}}),
+                                                          PaddingList({{0, 0}})                           // unknown
+                                                        })),                           // entries line up one-to-one with the DataLayout list above
+                framework::dataset::make("Expected",    { false,    // nchw
+                                                          true,
+                                                          true,
+                                                          true,
+                                                          true,
+                                                          true,
+                                                          false,
+                                                          true,
+                                                          true,     // nhwc
+                                                          false,
+                                                          true,
+                                                          true,
+                                                          true,
+                                                          false,
+                                                          true,
+                                                          false     // unknown
+                                                        })),
+                data_layout, padding_list, expected)
+{
+    ARM_COMPUTE_EXPECT(expected == arm_compute::graph::is_padding_in_height_or_width(data_layout, padding_list), framework::LogLevel::ERRORS);
+}
+
 // clang-format on
 // *INDENT-ON*
 
diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp
index 70e618efa1..62ff15a37f 100644
--- a/tests/validation/CL/PixelWiseMultiplication.cpp
+++ b/tests/validation/CL/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,6 +36,9 @@ namespace test
 {
 namespace validation
 {
+/** Synced with tests/validation/dynamic_fusion/gpu/cl/Mul.cpp from the dynamic fusion interface.
+ * Please check there for any differences in the coverage
+ */
 namespace
 {
 namespace
@@ -50,9 +53,6 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
-// Since in-place computation on CL-side hasn't been intended to be implemented, they are not tested.
-// However, this dataset is required for the shared fixture and it would make extension easier when
-// CL-side also starts supporting in-place computation.
 const auto InPlaceDataSet = framework::dataset::make("InPlace", { false });
 } //namespace
 // *INDENT-OFF*
@@ -81,7 +81,9 @@ using CLPixelWiseMultiplicationToF16Fixture = PixelWiseMultiplicationValidationF
 template <typename T>
 using CLPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFloatFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, float>;
 template <typename T>
-using CLPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, float>;
+using CLPixelWiseMultiplicationToF32BroadcastFixture = PixelWiseMultiplicationBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, float>;
+template <typename T>
+using CLPixelWiseMultiplicationIntegerFixture = PixelWiseMultiplicationValidationIntegerFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, int>;
 
 TEST_SUITE(CL)
 TEST_SUITE(PixelWiseMultiplication)
@@ -91,27 +93,24 @@ TEST_SUITE(PixelWiseMultiplication)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),      // Window shrink
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid scale
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                       }),
                framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Scale",{  2.f, 2.f, 2.f, -1.f, 1.f, 1.f})),
-               framework::dataset::make("Expected", { true, true, false, false, false, false})),
+               framework::dataset::make("Scale",{  2.f, 2.f, -1.f, 1.f, 1.f})),
+               framework::dataset::make("Expected", { true, true, false, false, false})),
                input1_info, input2_info, output_info, scale, expected)
 {
     bool has_error = bool(CLPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO));
@@ -119,6 +118,33 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 }
 // clang-format on
 // *INDENT-ON*
+TEST_SUITE(INT32) // S32 x S32 inputs with a fixed scale of 1.f
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationIntegerFixture<int>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(combine(
+                                                                           datasets::SmallShapes(),
+                                                                           framework::dataset::make("DataType1", DataType::S32)),
+                                                                       framework::dataset::make("DataType2", DataType::S32)),
+                                                               framework::dataset::make("Scale", { 1.f })),
+                                                       datasets::ConvertPolicies()),
+                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
+                                       EmptyActivationFunctionsDataset),
+                               InPlaceDataSet)) // InPlaceDataSet is declared outside this hunk
+{
+    validate(CLAccessor(_target), _reference); // Validate output; no tolerance argument is passed
+}
+FIXTURE_DATA_TEST_CASE(RunInplace, CLPixelWiseMultiplicationIntegerFixture<int>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::TinyShapes(),
+                                                                               framework::dataset::make("DataType1", DataType::S32)),
+                                                                       framework::dataset::make("DataType2", DataType::S32)),
+                                                               framework::dataset::make("Scale", { 1.f })),
+                                                       datasets::ConvertPolicies()),
+                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
+                                       EmptyActivationFunctionsDataset),
+                               framework::dataset::make("InPlace", { true }))) // Only the in-place configuration is exercised
+{
+    validate(CLAccessor(_target), _reference); // Validate output; no tolerance argument is passed
+}
+TEST_SUITE_END() // INT32
 
 TEST_SUITE(F16toF16)
 TEST_SUITE(Scale255)
@@ -133,18 +159,36 @@ TEST_SUITE(F32toF32)
 TEST_SUITE(Scale255)
 PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_255, TO_NEAREST_UP, EmptyActivationFunctionsDataset, VALIDATE(float, 1.f))
 PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunWithActivation, ToF32Fixture<float>, ALL, TinyShapes(), F32, F32, scale_255, TO_NEAREST_UP, ActivationFunctionsDataset, VALIDATE(float, 1.f))
+FIXTURE_DATA_TEST_CASE(RunInplace, CLPixelWiseMultiplicationToF32Fixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::TinyShapes(),
+                                                                               framework::dataset::make("DataTypeIn1", DataType::F32)),
+                                                                       framework::dataset::make("DataTypeIn2", DataType::F32)),
+                                                               framework::dataset::make("Scale", { scale_255 })),
+                                                       datasets::ConvertPolicies()),
+                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
+                                       EmptyActivationFunctionsDataset),
+                               framework::dataset::make("InPlace", { true }))) // Only the in-place configuration is exercised
+{
+    // Validate the in-place result through the VALIDATE macro, as the neighbouring Scale255 cases do
+    VALIDATE(float, 1.f)
+}
 TEST_SUITE_END() // Scale255
 TEST_SUITE_END() // F32toF32
 
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, BroadcastFixture<float>, PRECOMMIT, SmallShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP, EmptyActivationFunctionsDataset,
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, ToF32BroadcastFixture<float>, PRECOMMIT, SmallShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP,
+                                                 EmptyActivationFunctionsDataset,
                                                  VALIDATE(float, 1.f))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunWithActivationSmallBroadcast, BroadcastFixture<float>, ALL, TinyShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP, ActivationFunctionsDataset,
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunWithActivationSmallBroadcast, ToF32BroadcastFixture<float>, ALL, TinyShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP,
+                                                 ActivationFunctionsDataset,
                                                  VALIDATE(float, 1.f))
 
 template <typename T>
 using CLPixelWiseMultiplicationQuantizedFixture   = PixelWiseMultiplicationValidationQuantizedFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, T>;
 using CLPixelWiseMultiplicationQSYMM16ToS32Fxture = PixelWiseMultiplicationValidationQuantizedFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, int16_t, int16_t, int32_t>;
 
+template <typename T>
+using CLPixelWiseMultiplicationQuantizedBroadcastFixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture<CLTensor, CLAccessor, CLPixelWiseMultiplication, T, T>; // Used by (at least) the QASYMM8 RunSmallBroadcast and RunInplace cases
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
@@ -163,6 +207,41 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture<uint8
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLPixelWiseMultiplicationQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                               framework::dataset::make("Scale", { 1.f, 2.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OUtQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               InPlaceDataSet)) // InPlaceDataSet is declared outside this hunk
+{
+    // Validate the broadcast result against the reference within tolerance_qasymm8
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunInplace, CLPixelWiseMultiplicationQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                               framework::dataset::make("Scale", { 1.f, 2.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OUtQInfo", { QuantizationInfo(2.f / 255.f, 10) })), // Same QuantizationInfo as both sources
+                               framework::dataset::make("InPlace", { true }))) // Only the in-place configuration is exercised
+{
+    // Validate the in-place result against the reference within tolerance_qasymm8
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
diff --git a/tests/validation/CL/Pooling3dLayer.cpp b/tests/validation/CL/Pooling3dLayer.cpp
new file mode 100644
index 0000000000..84d630e6cf
--- /dev/null
+++ b/tests/validation/CL/Pooling3dLayer.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/TensorShape.h"
+#include "tests/framework/datasets/Datasets.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/Pooling3dLayerDataset.h"
+#include "tests/datasets/PoolingTypesDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/Pooling3dLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Pooling configurations for floating-point data types; this one is used by the NIGHTLY RunLarge cases */
+const auto Pooling3dLayerDatasetFP = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                             framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                     framework::dataset::make("Padding", { Padding3D(0, 1, 0), Padding3D(1, 1, 1) })),
+                                             framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 2, 2), Size3D(3, 3, 3) })),
+                                                                  framework::dataset::make("Stride", { Size3D(2, 2, 2), Size3D(2, 1, 1) })),
+                                                          framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                  framework::dataset::make("ExcludePadding", { true, false })); // Floating-point configurations used by the PRECOMMIT RunSmall cases
+
+const auto Pooling3DLayerDatasetQuantized = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                            framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                                    framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(1, 1, 2), Size3D(2, 2, 1)})),
+                                                            framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                    framework::dataset::make("ExcludePadding", { true })); // Quantized configurations: MAX/AVG only, ExcludePadding always true
+
+using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>; // NOTE(review): not referenced anywhere else in this file
+
+constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f);       /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float>   tolerance_f16(0.1f);         /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED integer data type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);        /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(Pooling3dLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // Entry i of each dataset below forms one validate() case (14 entries each)
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC), // Mismatching data type
+                                                       TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC), // Invalid pad/size combination
+                                                       TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC), // Invalid pad/size combination
+                                                       TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC), // Invalid output shape
+                                                       TensorInfo(TensorShape(5U, 13U, 15U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), // Global Pooling
+                                                       TensorInfo(TensorShape(13U,13U, 5U, 1U, 2U), 1, DataType::F32, DataLayout::NDHWC),  // Invalid output Global Pooling
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC), // PoolInfo stride is a default-constructed Size3D()
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 5U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(1U, 16U,  1U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                                     }),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U, 3U, 3U), 1, DataType::F16, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(2U, 30U, 11U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(2U, 25U, 16U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(2U, 27U, 13U, 3U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U,  1U,  1U, 1U, 3U), 1, DataType::F32, DataLayout::NDHWC), // Global pooling applied
+                                                       TensorInfo(TensorShape(5U,  2U,  2U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC), // Invalid output Global Pooling
+                                                       TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::QASYMM8, DataLayout::NDHWC), // Invalid data type
+                                                       TensorInfo(TensorShape(5U,  1U, 1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC), // Output width larger than input
+                                                       TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U),  1, DataType::F32, DataLayout::NDHWC), 
+                                                       TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                                       TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                                     })),
+               framework::dataset::make("PoolInfo",  { Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(2, 0, 0)),
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                                       Pooling3dLayerInfo(PoolingType::L2,  3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                                       Pooling3dLayerInfo(PoolingType::AVG),
+                                                       Pooling3dLayerInfo(PoolingType::MAX),
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(), Padding3D(), false),
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                                       Pooling3dLayerInfo(PoolingType::AVG),
+                                                       Pooling3dLayerInfo(PoolingType::MAX, 2, Size3D(1, 1, 2), Padding3D(0, 0, 0), false),
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(2U, 2U, 2U), Padding3D(), false),
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), true), // Pool size is smaller than the padding size with padding excluded
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), false), // Pool size is smaller than the padding size with padding included
+                                                       Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(2U, 2U, 2U), Padding3D(2,1,2,2,1,2), false, false, DimensionRoundingType::CEIL), // CEIL with asymmetric Padding
+                                                      })),
+               framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, true , false, true, false, false, false})),
+               input_info, output_info, pool_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLPooling3dLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info)) == expected, framework::LogLevel::ERRORS);
+}
+
+
+template <typename T>
+using CLPooling3dLayerFixture = Pooling3dLayerValidationFixture<CLTensor, CLAccessor, CLPooling3dLayer, T>; // Used by the Float RunSmall/RunLarge cases, including the explicit-size GlobalPooling ones
+
+template <typename T>
+using CLSpecialPooling3dLayerFixture = SpecialPooling3dLayerValidationFixture<CLTensor, CLAccessor, CLPooling3dLayer, T>; // Used by the FP32 RunSpecial case
+
+template <typename T>
+using CLPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture<CLTensor, CLAccessor, CLPooling3dLayer, T>; // Used by the RunSmallGlobal cases
+
+template <typename T>
+using CLPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLPooling3dLayer, T>; // Used by the QUANTIZED suites
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(QUANTIZED)
+
+TEST_SUITE(QASYMM8)
+// Small shapes, QASYMM8 (PRECOMMIT). NOTE(review): both InputQuantInfo entries are identical - confirm the duplicate is intended
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                       combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, 10), QuantizationInfo(1.f / 127.f, 10) })),
+                                                                                                                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, 5), QuantizationInfo(1.f / 127.f, 10) })))
+{
+    // Validate output within tolerance_qasymm8 (absolute tolerance of 1)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+// Large shapes, QASYMM8 (NIGHTLY)
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                       combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, 10), QuantizationInfo(1.f / 127.f, 10) })),
+                                                                                                                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, 5), QuantizationInfo(1.f / 127.f, 10) })))
+{
+    // Validate output within tolerance_qasymm8 (absolute tolerance of 1)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+// Small shapes, QASYMM8_SIGNED (PRECOMMIT)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                      combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                      framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10), QuantizationInfo(1.f / 127.f, -10) })),
+                                                                                                                      framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -5), QuantizationInfo(1.f / 127.f, -10) })))
+{
+    // Validate output within tolerance_qasymm8_signed (absolute tolerance of 1)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+// Large shapes, QASYMM8_SIGNED (NIGHTLY)
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                    combine(Pooling3DLayerDatasetQuantized,
+                                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                    framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10), QuantizationInfo(1.f / 127.f, -10) })),
+                                                                                                                    framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -5), QuantizationInfo(1.f / 127.f, -10) })))
+{
+    // Validate output within tolerance_qasymm8_signed (absolute tolerance of 1)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // QUANTIZED
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSpecial, CLSpecialPooling3dLayerFixture<float>, framework::DatasetMode::ALL, datasets::Pooling3dLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
+{
+    // Validate output within tolerance_f32 (absolute tolerance of 0.001)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5dShapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                            framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output within tolerance_f32 (absolute tolerance of 0.001)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
+                                                                                                          framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output within tolerance_f32 (absolute tolerance of 0.001)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })), // Equals dimensions 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output within tolerance_f32 (absolute tolerance of 0.001)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallGlobal, CLPooling3dLayerGlobalFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F32))) // No pooling size/stride/padding datasets are supplied to this fixture
+{
+    // Validate output within tolerance_f32 (absolute tolerance of 0.001)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })), // Equals dimensions 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output within tolerance_f32 (absolute tolerance of 0.001)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5x5Shapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                           framework::dataset::make("DataType", DataType::F16)))) // NOTE(review): shapes come from Small5x5Shapes(), while the FP32 RunSmall uses Small5dShapes() - confirm intended
+{
+    // Validate output within tolerance_f16 (absolute tolerance of 0.1)
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
+                                                                                                         framework::dataset::make("DataType", DataType::F16))))
+{
+    // Validate output within tolerance_f16 (absolute tolerance of 0.1)
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })), // Equals dimensions 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output within tolerance_f16 (absolute tolerance of 0.1)
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallGlobal, CLPooling3dLayerGlobalFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F16))) // No pooling size/stride/padding datasets are supplied to this fixture
+{
+    // Validate output within tolerance_f16 (absolute tolerance of 0.1)
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })), // Equals dimensions 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output within tolerance_f16 (absolute tolerance of 0.1)
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // Pooling3dLayer
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index de5c9f2e8d..9fe28c7acf 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -101,6 +101,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                        TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),     // Invalid output Global Pooling
                                                        TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8),
                                                        TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(1U, 16U, 1U),  1, DataType::F32),
                                                      }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32),
@@ -110,6 +111,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                        TensorInfo(TensorShape(2U, 2U, 5U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(12U, 12U, 5U), 1, DataType::QASYMM8),
                                                        TensorInfo(TensorShape(1U, 1U, 5U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(1U, 15U, 1U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("PoolInfo",  { PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                                        PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 2, 0)),
@@ -119,8 +121,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                        PoolingLayerInfo(PoolingType::MAX, DataLayout::NCHW),
                                                        PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NHWC, PadStrideInfo(), false),
                                                        PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW),
+                                                       PoolingLayerInfo(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(1, 1, 0, 0), false),
                                                       })),
-               framework::dataset::make("Expected", { false, false, false, false, true, false, true, true })),
+               framework::dataset::make("Expected", { false, false, false, false, true, false, true, true , false})),
                input_info, output_info, pool_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info)) == expected, framework::LogLevel::ERRORS);
@@ -131,6 +134,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 
 template <typename T>
 using CLPoolingLayerFixture = PoolingLayerValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
+template <typename T>
+using CLPoolingLayerMixedDataLayoutFixture = PoolingLayerValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, T, true>;
 
 template <typename T>
 using CLSpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
@@ -148,7 +153,7 @@ FIXTURE_DATA_TEST_CASE(RunSpecial, CLSpecialPoolingLayerFixture<float>, framewor
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType",
                                                                                                                           DataType::F32))),
                                                                                                           pool_data_layout_dataset))
@@ -156,6 +161,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLPoolingLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(datasets::PoolingTypes(),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0) })),
+                                       framework::dataset::make("ExcludePadding", { false })),
+                               framework::dataset::make("DataType", DataType::F32))),
+                       pool_data_layout_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
                                                                                                                 framework::dataset::make("DataType",
                                                                                                                         DataType::F32))),
@@ -165,21 +181,61 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::Datase
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPIndicesSmall,
-                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                                DataType::F32))),
-                                                                                                                        pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                        combine(PoolingLayerDatasetFPIndicesSmall,
+                                                                                                                                framework::dataset::make("DataType",
+                                                                                                                                        DataType::F32))),
+                                                                                                                        pool_data_layout_dataset),framework::dataset::make("UseKernelIndices", { false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
     validate(CLAccessor(_target_indices), _ref_indices);
 }
 
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
+                                                                             TensorShape(27U, 13U, 2U, 4U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size2D(27, 13) })),
+                                    framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F32)),
+                                    framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(79U, 37U, 11U),
+                                                                             TensorShape(79U, 37U, 11U, 4U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size2D(79, 37) })),
+                                    framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F32)),
+                                    framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLMixedPrecesionPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
-                                                                                                                       framework::dataset::make("DataType", DataType::F16))),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMixedPrecesionPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                       combine(PoolingLayerDatasetFPSmall,
+                                                                                                                               framework::dataset::make("DataType", DataType::F16))),
                                                                                                                        pool_data_layout_dataset),
                                                                                                                        pool_fp_mixed_precision_dataset))
 {
@@ -194,15 +250,55 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLMixedPrecesionPoolingLayerFixture<half>, fram
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPIndicesSmall,
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F16))),
-                                                                                                                       pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                       combine(PoolingLayerDatasetFPIndicesSmall,
+                                                                                                                               framework::dataset::make("DataType",
+                                                                                                                                       DataType::F16))),
+                                                                                                                       pool_data_layout_dataset), framework::dataset::make("UseKernelIndices", { false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
     validate(CLAccessor(_target_indices), _ref_indices);
 }
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
+                                                                             TensorShape(27U, 13U, 2U, 4U)
+                                                                            }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size2D(27, 13) })),
+                                    framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)),
+                                    framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(79U, 37U, 11U),
+                                                                             TensorShape(79U, 37U, 11U, 4U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size2D(79, 37) })),
+                                    framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)),
+                                    framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
@@ -210,9 +306,11 @@ TEST_SUITE(Quantized)
 
 template <typename T>
 using CLPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
+template <typename T>
+using CLPoolingLayerQuantizedMixedDataLayoutFixture = PoolingLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLPoolingLayer, T, true>;
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
                                                                                                                      combine(PoolingLayerDatasetQASYMM8Small,
                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8))),
                                                                                                                      pool_data_layout_dataset),
@@ -222,10 +320,23 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framew
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLPoolingLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
                                                                                                                     combine(PoolingLayerDatasetQASYMM8Small,
                                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
                                                                                                                     pool_data_layout_dataset),
@@ -235,6 +346,19 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<int8_t>, framewo
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8_s);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLPoolingLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_s);
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // PoolingLayer
diff --git a/tests/validation/CL/ROIPoolingLayer.cpp b/tests/validation/CL/ROIPoolingLayer.cpp
new file mode 100644
index 0000000000..eb16c1baec
--- /dev/null
+++ b/tests/validation/CL/ROIPoolingLayer.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/Globals.h"
+#include "tests/datasets/ROIDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ROIPoolingLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr RelativeTolerance<float> relative_tolerance_f32(0.01f);  /**< Relative tolerance passed to validate() when comparing the F32 target against the reference */
+constexpr AbsoluteTolerance<float> absolute_tolerance_f32(0.001f); /**< Absolute tolerance passed to validate() when comparing the F32 target against the reference */
+
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance passed to validate() when comparing the QASYMM8 target against the reference */
+} // end namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(RoiPooling)
+
+// *INDENT-OFF*
+// clang-format off
+// Checks CLROIPoolingLayer::validate() row by row: zip() pairs the i-th entry of every dataset, so each dataset needs one entry per row.
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Successful test
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::QASYMM8), // Successful test (quantized)
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Incorrect rois type
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching data type input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 2U), 1, DataType::F32), // Mismatching depth size input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching number of rois and output batch size
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Invalid number of values per ROIS
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching height and width input/output
+                                                     }),
+               framework::dataset::make("RoisInfo", { TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::F16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 10U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(4, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                    })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(5U, 5U, 3U, 4U), 1, DataType::F32),
+                                                     })),
+               framework::dataset::make("PoolInfo", { ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      })),
+               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false })), // Eighth entry covers the 5x5 output vs 7x7 pooled size row
+               input_info, rois_info, output_info, pool_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLROIPoolingLayer::validate(&input_info.clone()->set_is_resizable(true), &rois_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), pool_info)) == expected, framework::LogLevel::ERRORS);
+}
+
+using CLROIPoolingLayerFloatFixture = ROIPoolingLayerFixture<CLTensor, CLAccessor, CLROIPoolingLayer, float>;
+
+TEST_SUITE(Float)
+FIXTURE_DATA_TEST_CASE(Small, CLROIPoolingLayerFloatFixture, framework::DatasetMode::ALL,
+                       framework::dataset::combine(framework::dataset::combine(datasets::SmallROIDataset(),
+                                                    framework::dataset::make("DataType", { DataType::F32 })),
+                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, relative_tolerance_f32, .02f, absolute_tolerance_f32);
+}
+
+TEST_SUITE_END() // Float test suite end
+
+// Begin quantized tests
+template <typename T>
+using CLROIPoolingLayerQuantizedFixture = ROIPoolingLayerQuantizedFixture<CLTensor, CLAccessor, CLROIPoolingLayer, T>;
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(Small, CLROIPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(datasets::SmallROIDataset(),
+                                                       framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 127) })),
+                               framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // end qasymm8 tests
+
+TEST_SUITE_END() // RoiPooling
+TEST_SUITE_END() // CL
+
+} // validation namespace end
+} // test namespace end
+} // arm_compute namespace end
diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp
index 947f84af49..8a8fa4aef0 100644
--- a/tests/validation/CL/ReduceMean.cpp
+++ b/tests/validation/CL/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,7 @@ constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f); /**< Tolerance value
 constexpr AbsoluteTolerance<float>   tolerance_f16(0.03f);  /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);  /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric quantized type */
 
-const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }),
+const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
 const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3), Coordinates(1, 2), Coordinates(2, 1) }), framework::dataset::make("KeepDims", { false }));
 } // namespace
diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp
index 31c5a97925..beb58381ca 100644
--- a/tests/validation/CL/ReductionOperation.cpp
+++ b/tests/validation/CL/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,10 +50,11 @@ RelativeTolerance<float> rel_tolerance_f16(0.2f);
 /** Tolerance for quantized operations */
 RelativeTolerance<float> tolerance_qasymm8(1);
 
-const auto ReductionOperationsSumProd = framework::dataset::make("ReductionOperationsSumProd",
+const auto ReductionOperationsSumProdMean = framework::dataset::make("ReductionOperationsSumProdMean",
 {
     ReductionOperation::SUM,
     ReductionOperation::PROD,
+    ReductionOperation::MEAN_SUM
 
 });
 const auto ReductionOperationsMinMax = framework::dataset::make("ReductionMinMax",
@@ -109,15 +110,16 @@ using CLReductionOperationFixture = ReductionOperationFixture<CLTensor, CLAccess
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall4D, CLReductionOperationFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
-                                       ReductionOperationsMinMax)),
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+                                       concat(ReductionOperationsSumProdMean,
+                                              ReductionOperationsMinMax)),
                                KeepDimensions))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
+                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProdMean,
                                        ReductionOperationsMinMax)),
                                KeepDimensions))
 {
@@ -127,15 +129,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<half>, framework::D
 TEST_SUITE_END() // F16
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall4D, CLReductionOperationFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
-                                       ReductionOperationsMinMax)),
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+                                       concat(ReductionOperationsSumProdMean,
+                                              ReductionOperationsMinMax)),
                                KeepDimensions))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
+                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProdMean,
                                        ReductionOperationsMinMax)),
                                KeepDimensions))
 {
@@ -152,7 +155,7 @@ TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                               ReductionOperationsSumProd),
+                                               ReductionOperationsSumProdMean),
                                        framework::dataset::make("QuantizationInfo", QuantizationInfo(1.f / 64, 2))),
                                KeepDimensions))
 {
@@ -172,7 +175,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                               ReductionOperationsSumProd),
+                                               ReductionOperationsSumProdMean),
                                        framework::dataset::make("QuantizationInfo", QuantizationInfo(1.f / 64, 2))),
                                KeepDimensions))
 {
diff --git a/tests/validation/CL/Reverse.cpp b/tests/validation/CL/Reverse.cpp
index 11df0e7803..82effc2136 100644
--- a/tests/validation/CL/Reverse.cpp
+++ b/tests/validation/CL/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,9 +41,10 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
-auto run_small_dataset = combine(datasets::SmallShapes(), datasets::Tiny1DShapes());
+auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes());
 auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes());
 
 } // namespace
@@ -53,33 +54,34 @@ TEST_SUITE(Reverse)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
+        make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis shape
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis length (> 4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         }),
-        framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+        make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         })),
-        framework::dataset::make("AxisInfo",{ TensorInfo(TensorShape(3U), 1, DataType::U8),
+        make("AxisInfo",{ TensorInfo(TensorShape(3U), 1, DataType::U8),
                                            TensorInfo(TensorShape(2U, 10U), 1, DataType::U32),
                                            TensorInfo(TensorShape(8U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
         })),
-        framework::dataset::make("Expected", { false, false, false, false, true, true})),
+        make("Expected", { false, false, false, false, true, true})),
         src_info, dst_info, axis_info, expected)
 {
     Status s = CLReverse::validate(&src_info.clone()->set_is_resizable(false),
                                   &dst_info.clone()->set_is_resizable(false),
-                                  &axis_info.clone()->set_is_resizable(false));
+                                  &axis_info.clone()->set_is_resizable(false),
+                                  false);
     ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -93,7 +95,11 @@ TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        CLReverseFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -102,7 +108,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLReverseFixture<half>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -113,7 +123,11 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        CLReverseFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -122,7 +136,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLReverseFixture<float>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -135,7 +153,11 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        CLReverseFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -144,7 +166,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLReverseFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/RoundLayer.cpp b/tests/validation/CL/RoundLayer.cpp
index 5aa9ca6b4e..f0c88d1ad3 100644
--- a/tests/validation/CL/RoundLayer.cpp
+++ b/tests/validation/CL/RoundLayer.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/RsqrtLayer.cpp b/tests/validation/CL/RsqrtLayer.cpp
index 29c113b105..2353bda8d3 100644
--- a/tests/validation/CL/RsqrtLayer.cpp
+++ b/tests/validation/CL/RsqrtLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -42,8 +42,11 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<float> tolerance_fp32(0.000001f);
-RelativeTolerance<float> tolerance_fp16(0.001f);
+RelativeTolerance<float>             tolerance_fp32(0.000001f);
+RelativeTolerance<float>             tolerance_fp16(0.001f);
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric type */
+
 } // namespace
 TEST_SUITE(CL)
 TEST_SUITE(RsqrtLayer)
@@ -68,6 +71,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
 // *INDENT-ON*
 template <typename T>
 using CLRsqrtLayerFixture = RsqrtValidationFixture<CLTensor, CLAccessor, CLRsqrtLayer, T>;
+template <typename T>
+using CLRsqrtLayerQuantizedFixture = RsqrtQuantizedValidationFixture<CLTensor, CLAccessor, CLRsqrtLayer, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
@@ -102,6 +107,30 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLRsqrtLayerFixture<float>, framework::DatasetM
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLRsqrtLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::QASYMM8_SIGNED)),
+                                                                                                                  framework::dataset::make("SrcQInfo", { QuantizationInfo(0.4044, -128) })),
+                                                                                                                  framework::dataset::make("OutQInfo", { QuantizationInfo(0.0027, -128) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_s);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLRsqrtLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::QASYMM8)),
+                                                                                                                   framework::dataset::make("SrcQInfo", { QuantizationInfo(0.4044, 0) })),
+                                                                                                                   framework::dataset::make("OutQInfo", { QuantizationInfo(0.0027, 0) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RsqrtLayer
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index 523b49deb7..10a99ae34f 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -82,6 +82,7 @@ constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
 constexpr float                      tolerance_f32_absolute(0.001f);
 
 RelativeTolerance<float> tolerance_f32(0.05);
+constexpr float          abs_tolerance_f16(0.1f);
 RelativeTolerance<half>  tolerance_f16(half(0.1));
 
 constexpr float tolerance_num_f32(0.01f);
@@ -186,16 +187,6 @@ TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
-TEST_CASE(WindowShrink, framework::DatasetMode::ALL)
-{
-    const auto input  = TensorInfo{ TensorShape(37U, 37U, 2U), 1, DataType::F32 };
-    const auto output = TensorInfo{ TensorShape(39U, 55U, 2U), 1, DataType::F32 };
-    Status     result{};
-
-    result = CLScale::validate(&input.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-}
-
 TEST_CASE(IncorrectScaleFactor, framework::DatasetMode::ALL)
 {
     const auto     input                = TensorInfo{ TensorShape(28U, 33U, 2U), 1, DataType::F32 };
@@ -210,6 +201,8 @@ TEST_SUITE_END() // Validate
 
 template <typename T>
 using CLScaleFixture = ScaleValidationFixture<CLTensor, CLAccessor, CLScale, T>;
+template <typename T>
+using CLScaleMixedDataLayoutFixture = ScaleValidationFixture<CLTensor, CLAccessor, CLScale, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -223,6 +216,15 @@ FIXTURE_DATA_TEST_CASE(Run, CLScaleFixture<float>, framework::DatasetMode::ALL,
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLScaleMixedDataLayoutFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
 FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
@@ -261,7 +263,7 @@ FIXTURE_DATA_TEST_CASE(Run, CLScaleFixture<half>, framework::DatasetMode::ALL, A
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
 {
@@ -270,7 +272,7 @@ FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<half>, framework::Dataset
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 const auto f16_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
 FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(f16_nightly_shape, ScaleSamplingPolicySet))
@@ -280,7 +282,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleFixture<half>, framework::DatasetMode:
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(f16_nightly_shape, ScaleAlignCornersSamplingPolicySet))
 {
@@ -289,7 +291,7 @@ FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleFixture<half>, framework::
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
diff --git a/tests/validation/CL/ScatterLayer.cpp b/tests/validation/CL/ScatterLayer.cpp
new file mode 100644
index 0000000000..b1531eb64a
--- /dev/null
+++ b/tests/validation/CL/ScatterLayer.cpp
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLScatter.h"
+#include "tests/validation/fixtures/ScatterLayerFixture.h"
+#include "tests/datasets/ScatterDataset.h"
+#include "tests/CL/CLAccessor.h"
+#include "arm_compute/function_info/ScatterInfo.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
+RelativeTolerance<float> tolerance_f16(0.02f); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
+RelativeTolerance<int32_t> tolerance_int(0); /**< Tolerance value for comparing reference's output against implementation's output for integer data types */
+} // namespace
+
+template <typename T>
+using CLScatterLayerFixture = ScatterValidationFixture<CLTensor, CLAccessor, CLScatter, T>;
+
+using framework::dataset::make;
+
+TEST_SUITE(CL)
+TEST_SUITE(Scatter)
+DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
+    make("InputInfo", { TensorInfo(TensorShape(9U), 1, DataType::F32),    // Mismatching data types
+                        TensorInfo(TensorShape(15U), 1, DataType::F32),   // Valid
+                        TensorInfo(TensorShape(15U), 1, DataType::U8),   // Valid
+                        TensorInfo(TensorShape(8U), 1, DataType::F32),
+                        TensorInfo(TensorShape(217U), 1, DataType::F32),    // Mismatch input/output dims.
+                        TensorInfo(TensorShape(217U), 1, DataType::F32),    // Updates dim higher than Input/Output dims.
+                        TensorInfo(TensorShape(12U), 1, DataType::F32),     // Indices wrong datatype.
+                        TensorInfo(TensorShape(9U, 3U, 4U), 1, DataType::F32), // Number of updates != number of indices
+                        TensorInfo(TensorShape(17U, 3U, 3U, 2U), 1, DataType::F32), // index_len != (dst_dims - upt_dims + 1)
+                        TensorInfo(TensorShape(17U, 3U, 3U, 2U, 2U, 2U), 1, DataType::F32), // index_len > 5
+    }),
+    make("UpdatesInfo",{TensorInfo(TensorShape(3U), 1, DataType::F16),
+                        TensorInfo(TensorShape(15U), 1, DataType::F32),
+                        TensorInfo(TensorShape(15U), 1, DataType::U8),
+                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                        TensorInfo(TensorShape(217U), 1, DataType::F32),
+                        TensorInfo(TensorShape(217U, 3U), 1, DataType::F32),
+                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                        TensorInfo(TensorShape(9U, 3U, 2U), 1, DataType::F32),
+                        TensorInfo(TensorShape(17U, 3U, 2U), 1, DataType::F32),
+                        TensorInfo(TensorShape(1U), 1, DataType::F32),
+    }),
+    make("IndicesInfo",{TensorInfo(TensorShape(1U, 3U), 1, DataType::S32),
+                        TensorInfo(TensorShape(1U, 15U), 1, DataType::S32),
+                        TensorInfo(TensorShape(1U, 15U), 1, DataType::S32),
+                        TensorInfo(TensorShape(1U, 2U), 1, DataType::S32),
+                        TensorInfo(TensorShape(1U, 271U), 1, DataType::S32),
+                        TensorInfo(TensorShape(1U, 271U), 1, DataType::S32),
+                        TensorInfo(TensorShape(1U, 2U), 1 , DataType::F32),
+                        TensorInfo(TensorShape(1U, 4U), 1, DataType::S32),
+                        TensorInfo(TensorShape(3U, 2U), 1, DataType::S32),
+                        TensorInfo(TensorShape(6U, 2U), 1, DataType::S32),
+    }),
+    make("OutputInfo",{TensorInfo(TensorShape(9U), 1, DataType::F16),
+                       TensorInfo(TensorShape(15U), 1, DataType::F32),
+                       TensorInfo(TensorShape(15U), 1, DataType::U8),
+                       TensorInfo(TensorShape(8U), 1, DataType::F32),
+                       TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+                       TensorInfo(TensorShape(271U), 1, DataType::F32),
+                       TensorInfo(TensorShape(12U), 1, DataType::F32),
+                       TensorInfo(TensorShape(9U, 3U, 4U), 1, DataType::F32),
+                       TensorInfo(TensorShape(17U, 3U, 3U, 2U), 1, DataType::F32),
+                       TensorInfo(TensorShape(17U, 3U, 3U, 2U, 2U, 2U), 1, DataType::F32),
+    }),
+    make("ScatterInfo",{ ScatterInfo(ScatterFunction::Add, false),
+                         ScatterInfo(ScatterFunction::Max, false),
+                         ScatterInfo(ScatterFunction::Max, false),
+                         ScatterInfo(ScatterFunction::Min, false),
+                         ScatterInfo(ScatterFunction::Add, false),
+                         ScatterInfo(ScatterFunction::Update, false),
+                         ScatterInfo(ScatterFunction::Sub, false),
+                         ScatterInfo(ScatterFunction::Sub, false),
+                         ScatterInfo(ScatterFunction::Update, false),
+                         ScatterInfo(ScatterFunction::Update, false),
+    }),
+    make("Expected", { false, true, true, true, false, false, false, false, false, false })),
+    input_info, updates_info, indices_info, output_info, scatter_info, expected)
+{
+    const Status status = CLScatter::validate(&input_info, &updates_info, &indices_info, &output_info, scatter_info);
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+
+const auto allScatterFunctions = make("ScatterFunction",
+    {ScatterFunction::Update, ScatterFunction::Add, ScatterFunction::Sub, ScatterFunction::Min, ScatterFunction::Max });
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::Small1DScatterDataset(),
+        make("DataType", {DataType::F32}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {true})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+// With this test, src should be passed as nullptr.
+FIXTURE_DATA_TEST_CASE(RunSmallZeroInit, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::Small1DScatterDataset(),
+        make("DataType", {DataType::F32}),
+        make("ScatterFunction", {ScatterFunction::Add}),
+        make("ZeroInit", {true}),
+        make("Inplace", {false}),
+        make("Padding", {true})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+// Updates, src and dst all have the same number of dimensions.
+FIXTURE_DATA_TEST_CASE(RunSmallMultiDim, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMultiDimDataset(),
+        make("DataType", {DataType::F32}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {true})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+// m+1-D to m+n-D cases
+FIXTURE_DATA_TEST_CASE(RunSmallMultiIndices, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMultiIndicesDataset(),
+        make("DataType", {DataType::F32}),
+        make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add }),
+        make("ZeroInit", {false}),
+        make("Inplace", {false, true}),
+        make("Padding", {true})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+// m+k, k-1-D m+n-D case
+FIXTURE_DATA_TEST_CASE(RunSmallBatchedMultiIndices, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterBatchedDataset(),
+        make("DataType", {DataType::F32}),
+        make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add}),
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {true})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+// m+k, k-1-D m+n-D case
+FIXTURE_DATA_TEST_CASE(RunSmallScatterScalar, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterScalarDataset(),
+        make("DataType", {DataType::F32}),
+        make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add}),
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false}))) // NOTE: Padding not supported in this dataset
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // FP32
+
+
+// NOTE: Padding is disabled for the SmallScatterMixedDataset due to certain shapes not supporting padding.
+//       Padding is well tested in F32 Datatype test cases.
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::F16}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE(Integer)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int32_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::S32}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::S16}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::S8}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // S8
+
+TEST_SUITE(U32)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint32_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::U32}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint16_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::U16}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallScatterMixedDataset(),
+        make("DataType", {DataType::U8}),
+        allScatterFunctions,
+        make("ZeroInit", {false}),
+        make("Inplace", {false}),
+        make("Padding", {false})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // U8
+TEST_SUITE_END() // Integer
+
+TEST_SUITE_END() // Scatter
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/Select.cpp b/tests/validation/CL/Select.cpp
index 3d7c61aab5..d3540cae48 100644
--- a/tests/validation/CL/Select.cpp
+++ b/tests/validation/CL/Select.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -107,6 +107,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunOneDim,
+                       CLSelectFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(framework::dataset::make("Shape", TensorShape(1U, 16U)),
+                                       framework::dataset::make("has_same_rank", { false, true })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLSelectFixture<half>,
                        framework::DatasetMode::NIGHTLY,
@@ -127,6 +138,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunOneDim,
+                       CLSelectFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(framework::dataset::make("Shape", TensorShape(1U, 16U)),
+                                       framework::dataset::make("has_same_rank", { false, true })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLSelectFixture<float>,
                        framework::DatasetMode::NIGHTLY,
@@ -149,6 +171,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
     validate(CLAccessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunOneDim,
+                       CLSelectFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(framework::dataset::make("Shape", TensorShape(1U, 16U)),
+                                       framework::dataset::make("has_same_rank", { false, true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        CLSelectFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
diff --git a/tests/validation/CL/SinLayer.cpp b/tests/validation/CL/SinLayer.cpp
index e40c990db6..f0cb4c314e 100644
--- a/tests/validation/CL/SinLayer.cpp
+++ b/tests/validation/CL/SinLayer.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 396e274e0b..eb47b7f666 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,12 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/SoftmaxLayerFixture.h"
 
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/BlobMemoryPool.h"
+
 namespace arm_compute
 {
 namespace test
@@ -62,6 +68,47 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 TEST_SUITE(CL)
 TEST_SUITE(SoftmaxLayer)
 
+TEST_CASE(SimpleMemoryManaged, framework::DatasetMode::ALL)
+{
+    // The purpose of this test is to test if the function can
+    // run correctly even with the given memory manager from its caller
+    // (Similar scenario when the library is integrated into other software)
+    // especially when working with workspace() method of
+    // @ref arm_compute::opencl::ClSoftmax.
+    const auto shape = TensorShape{4,2};    // Random shape, not important
+    constexpr auto dt = DataType::F32;      // Random data type, not important
+
+    // Create a memory manager
+    auto lm = std::make_shared<BlobLifetimeManager>();
+    auto pm = std::make_shared<arm_compute::PoolManager>();
+    auto alloc = std::make_unique<CLBufferAllocator>();
+    auto mm = std::make_shared<MemoryManagerOnDemand>(lm, pm);
+
+    auto src = create_tensor<CLTensor>(shape, dt);
+    auto dst = create_tensor<CLTensor>(shape, dt);
+    src.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    // Create the function with the memory manager
+    CLSoftmaxLayer smx(mm);
+    smx.configure(&src, &dst);
+
+    // Populate the memory, acquire() will happen in run()
+    mm->populate(*alloc.get(), 1);
+
+    std::vector<float> input_vals{0.0f, 1.0f, 0.0f, 0.0f, 0.5f, 0.0f, 0.0f, 0.0f,};
+    library->fill_static_values(CLAccessor(src), input_vals);
+
+    smx.run();
+
+    // Compute reference to compare
+    SimpleTensor<float> ref_src{shape, dt};
+    library->fill_static_values(ref_src, input_vals);
+    auto ref_dst = reference::softmax_layer<float>(ref_src, 1., 0, false);
+
+    validate(CLAccessor(dst), ref_dst);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
diff --git a/tests/validation/CL/Tile.cpp b/tests/validation/CL/Tile.cpp
index a06c05744f..f243780c00 100644
--- a/tests/validation/CL/Tile.cpp
+++ b/tests/validation/CL/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,7 @@ namespace validation
 namespace
 {
 const auto MultiplesDataset = framework::dataset::make("Multiples", { Multiples{ 3 },
+                                                                      Multiples{ 7 },
                                                                       Multiples{ 2, 2 },
                                                                       Multiples{ 1, 1, 3, 4 },
                                                                       Multiples{ 2, 1, 2, 2 },
diff --git a/tests/validation/CL/Transpose.cpp b/tests/validation/CL/Transpose.cpp
index 943534058b..6cf5fe8537 100644
--- a/tests/validation/CL/Transpose.cpp
+++ b/tests/validation/CL/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,12 +50,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
     framework::dataset::make("InputInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::U16), // Invalid shape
                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::U8),  // Wrong data type
                                             TensorInfo(TensorShape(20U, 16U), 1, DataType::U32), // Valid
+                                            TensorInfo(TensorShape(20U, 16U, 3U, 3U), 1, DataType::U16), // Transpose only first two dimensions
                                           }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(13U, 20U), 1, DataType::U32),
                                             TensorInfo(TensorShape(31U, 20U), 1, DataType::U16),
                                             TensorInfo(TensorShape(16U, 20U), 1, DataType::U32),
+                                            TensorInfo(TensorShape(16U, 20U, 3U, 3U), 1, DataType::U16),
                                            })),
-    framework::dataset::make("Expected", { false, false, true })),
+    framework::dataset::make("Expected", { false, false, true, true })),
     a_info, output_info, expected)
 {
     // Lock tensors
@@ -80,6 +82,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLTransposeFixture<uint8_t>, framework::Dataset
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeHighDimensional,
+                       CLTransposeFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(concat(concat(datasets::Large3DShapes(), datasets::Large4DShapes()),
+                                      datasets::Large5dShapes()),
+                               framework::dataset::make("DataType", DataType::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
 TEST_SUITE_END() // U8
 
 TEST_SUITE(U16)
@@ -106,6 +118,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLTransposeFixture<uint32_t>, framework::Datase
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallHighDimensional,
+                       CLTransposeFixture<uint32_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(concat(datasets::Small3DShapes(), datasets::Small4DShapes()),
+                               framework::dataset::make("DataType", DataType::U32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, CLTransposeFixture<uint32_t>, framework::DatasetMode::NIGHTLY, combine(concat(datasets::Large1DShapes(), datasets::Large2DShapes()),
                                                                                                         framework::dataset::make("DataType", DataType::U32)))
 {
diff --git a/tests/validation/CL/UNIT/DynamicTensor.cpp b/tests/validation/CL/UNIT/DynamicTensor.cpp
index 833256039e..ac433721d8 100644
--- a/tests/validation/CL/UNIT/DynamicTensor.cpp
+++ b/tests/validation/CL/UNIT/DynamicTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,11 +29,8 @@
 #include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/PoolManager.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLIm2ColKernel.h"
 #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
diff --git a/tests/validation/CL/UNIT/Multithreaded.cpp b/tests/validation/CL/UNIT/Multithreaded.cpp
new file mode 100644
index 0000000000..5c75df709d
--- /dev/null
+++ b/tests/validation/CL/UNIT/Multithreaded.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/RuntimeContext.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/ParametersLibrary.h"
+#include "tests/validation/Validation.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+#include <thread>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(RuntimeContext)
+// This test tries scheduling work concurrently from multiple independent threads
+TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
+{
+    constexpr auto num_threads(16u);
+    std::array<CLActivationLayer, num_threads>         func{};
+    std::array<CLPixelWiseMultiplication, num_threads> pmul{};
+    std::array<CLTensor, num_threads>                  s0{};
+    std::array<CLTensor, num_threads>                  s1{};
+
+    std::array<CLTensor, num_threads> st{};
+    std::array<CLTensor, num_threads> dt{};
+
+    const TensorShape         tensor_shape(128u, 4u, 5u);
+    const ActivationLayerInfo ainfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.5f, 1.f);
+    std::array<std::thread, num_threads> threads;
+    auto ctx = parameters->get_ctx<CLTensor>();
+
+    for(auto i = 0u; i < num_threads; ++i)
+    {
+        s0[i]   = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+        s1[i]   = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+        st[i]   = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+        dt[i]   = create_tensor<CLTensor>(tensor_shape, DataType::F32, 1);
+        func[i] = CLActivationLayer(ctx);
+        pmul[i] = CLPixelWiseMultiplication();
+        threads[i] =
+            std::thread([&,i]
+        {
+            auto &s  = st[i];
+            auto &t  = dt[i];
+            auto &p0 = s0[i];
+            auto &p1 = s1[i];
+            pmul[i].configure(&p0, &p1, &s, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_NEAREST_UP);
+            func[i].configure(&s, &t, ainfo);
+            s.allocator()->allocate();
+            t.allocator()->allocate();
+            p0.allocator()->allocate();
+            p1.allocator()->allocate();
+            library->fill_tensor_uniform(CLAccessor(p0), 0, -1.f, 1.f);
+            library->fill_tensor_uniform(CLAccessor(p1), 0, -1.f, 1.f);
+            pmul[i].run();
+            func[i].run();
+        });
+    }
+
+    for(auto &t : threads)
+    {
+        t.join();
+    }
+
+    SimpleTensor<float> rs{ tensor_shape, DataType::F32, 1 };
+    SimpleTensor<float> ra{ tensor_shape, DataType::F32, 1 };
+    SimpleTensor<float> rb{ tensor_shape, DataType::F32, 1 };
+    library->fill_tensor_uniform(ra, 0, -1.f, 1.f);
+    library->fill_tensor_uniform(rb, 0, -1.f, 1.f);
+    const auto mul    = reference::pixel_wise_multiplication<float, float, float>(ra, rb, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_NEAREST_UP, DataType::F32);
+    const auto golden = reference::activation_layer<float>(mul, ainfo);
+    for(auto &d : dt)
+    {
+        validate(CLAccessor(d), golden);
+    }
+}
+
+TEST_SUITE_END() // RuntimeContext
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/UNIT/TensorAllocator.cpp b/tests/validation/CL/UNIT/TensorAllocator.cpp
index 3ccdd99fe3..559f47e16c 100644
--- a/tests/validation/CL/UNIT/TensorAllocator.cpp
+++ b/tests/validation/CL/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,14 @@
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 
 #include "arm_compute/core/utils/misc/MMappedFile.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
 #include "tests/framework/Asserts.h"
@@ -60,12 +65,108 @@ cl_mem import_malloc_memory_helper(void *ptr, size_t size)
 
     return buf;
 }
+
+class DummyAllocator final : public IAllocator
+{
+public:
+    DummyAllocator() = default;
+
+    void *allocate(size_t size, size_t alignment) override
+    {
+        ++_n_calls;
+        return _backend_allocator.allocate(size, alignment);
+    }
+    void free(void *ptr) override
+    {
+        return _backend_allocator.free(ptr);
+    }
+    std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override
+    {
+        // Needs to be implemented as it is the one that is used internally by the CLTensorAllocator
+        ++_n_calls;
+        return _backend_allocator.make_region(size, alignment);
+    }
+    int get_n_calls() const
+    {
+        return _n_calls;
+    }
+
+private:
+    int               _n_calls{};
+    CLBufferAllocator _backend_allocator{};
+};
+
+void run_conv2d(std::shared_ptr<IMemoryManager> mm, IAllocator &mm_allocator)
+{
+    // Create tensors
+    CLTensor src, weights, bias, dst;
+    src.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U, 2U), 1, DataType::F32, DataLayout::NHWC));
+    weights.allocator()->init(TensorInfo(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32, DataLayout::NHWC));
+    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32, DataLayout::NHWC));
+    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U, 2U), 1, DataType::F32, DataLayout::NHWC));
+
+    // Create and configure function
+    CLGEMMConvolutionLayer conv(mm);
+    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1U, 1U, 1U, 1U));
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    bias.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    // Finalize memory manager
+    if(mm != nullptr)
+    {
+        mm->populate(mm_allocator, 1 /* num_pools */);
+        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == 1, framework::LogLevel::ERRORS);
+    }
+
+    conv.run();
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(UNIT)
 TEST_SUITE(TensorAllocator)
 
+/* Validate that an external global allocator can be used for all internal allocations */
+TEST_CASE(ExternalGlobalAllocator, framework::DatasetMode::ALL)
+{
+    DummyAllocator global_tensor_alloc;
+    CLTensorAllocator::set_global_allocator(&global_tensor_alloc);
+
+    // Run a convolution
+    run_conv2d(nullptr /* mm */, global_tensor_alloc);
+
+    // Check that the allocator has been called more than 4 times
+    ARM_COMPUTE_EXPECT(global_tensor_alloc.get_n_calls() > 4, framework::LogLevel::ERRORS);
+
+    // Nullify global allocator
+    CLTensorAllocator::set_global_allocator(nullptr);
+}
+
+/* Validate that an external global allocator can be used for the pool manager */
+TEST_CASE(ExternalGlobalAllocatorMemoryPool, framework::DatasetMode::ALL)
+{
+    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+    auto pool_mgr     = std::make_shared<PoolManager>();
+    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+    DummyAllocator global_tensor_alloc;
+    CLTensorAllocator::set_global_allocator(&global_tensor_alloc);
+
+    // Run a convolution
+    run_conv2d(mm, global_tensor_alloc);
+
+    // Check that the allocator has been called more than 4 times
+    ARM_COMPUTE_EXPECT(global_tensor_alloc.get_n_calls() > 4, framework::LogLevel::ERRORS);
+
+    // Nullify global allocator
+    CLTensorAllocator::set_global_allocator(nullptr);
+}
+
 /** Validates import memory interface when importing cl buffer objects */
 TEST_CASE(ImportMemoryBuffer, framework::DatasetMode::ALL)
 {
@@ -79,31 +180,31 @@ TEST_CASE(ImportMemoryBuffer, framework::DatasetMode::ALL)
     // Negative case : Import nullptr
     CLTensor t1;
     t1.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(!bool(t1.allocator()->import_memory(cl::Buffer())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t1.allocator()->import_memory(cl::Buffer())));
+    ARM_COMPUTE_ASSERT(t1.info()->is_resizable());
 
     // Negative case : Import memory to a tensor that is memory managed
     CLTensor    t2;
     MemoryGroup mg;
     t2.allocator()->set_associated_memory_group(&mg);
-    ARM_COMPUTE_EXPECT(!bool(t2.allocator()->import_memory(buf)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t2.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(buf)));
+    ARM_COMPUTE_ASSERT(t2.info()->is_resizable());
 
     // Negative case : Invalid buffer size
     CLTensor         t3;
     const TensorInfo info_neg(TensorShape(32U, 16U, 3U), 1, DataType::F32);
     t3.allocator()->init(info_neg);
-    ARM_COMPUTE_EXPECT(!bool(t3.allocator()->import_memory(buf)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t3.allocator()->import_memory(buf)));
+    ARM_COMPUTE_ASSERT(t3.info()->is_resizable());
 
     // Positive case : Set raw pointer
     CLTensor t4;
     t4.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(bool(t4.allocator()->import_memory(buf)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!t4.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(t4.allocator()->import_memory(buf)));
+    ARM_COMPUTE_ASSERT(!t4.info()->is_resizable());
     ARM_COMPUTE_EXPECT(t4.cl_buffer().get() == buf.get(), framework::LogLevel::ERRORS);
     t4.allocator()->free();
-    ARM_COMPUTE_EXPECT(t4.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t4.info()->is_resizable());
     ARM_COMPUTE_EXPECT(t4.cl_buffer().get() != buf.get(), framework::LogLevel::ERRORS);
 }
 
@@ -141,8 +242,8 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
         std::align(alignment, total_size_in_bytes, aligned_ptr, space);
 
         cl::Buffer wrapped_buffer(import_malloc_memory_helper(aligned_ptr, total_size_in_bytes));
-        ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(wrapped_buffer)), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(wrapped_buffer)));
+        ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
         // Fill tensor
         std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -205,12 +306,12 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
         // Map file
         utils::mmap_io::MMappedFile mmapped_file("test_mmap_import.bin", 0 /** Whole file */, 0);
-        ARM_COMPUTE_EXPECT(mmapped_file.is_mapped(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(mmapped_file.is_mapped());
         unsigned char *data = mmapped_file.data();
 
         cl::Buffer wrapped_buffer(import_malloc_memory_helper(data, total_size_in_bytes));
-        ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(wrapped_buffer)), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(wrapped_buffer)));
+        ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
         // Fill tensor
         std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -233,7 +334,7 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
         // Release resources
         tensor.allocator()->free();
-        ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
     }
 }
 #endif // !defined(BARE_METAL)
diff --git a/tests/validation/CL/UNIT/Tuner.cpp b/tests/validation/CL/UNIT/Tuner.cpp
deleted file mode 100644
index cf2513bf2c..0000000000
--- a/tests/validation/CL/UNIT/Tuner.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
-#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "tests/Utils.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(UNIT)
-TEST_SUITE(Tuner)
-
-/** Validates static tuning of Bifrost tuner */
-TEST_CASE(BifrostTunerSimple, framework::DatasetMode::ALL)
-{
-    // Create tuner
-    tuners::BifrostTuner tuner;
-
-    // Create tensors
-    auto src     = create_tensor<CLTensor>(TensorShape(13U, 13U, 16U), DataType::F32);
-    auto weights = create_tensor<CLTensor>(TensorShape(3U, 3U, 16U, 3U), DataType::F32);
-    auto bias    = create_tensor<CLTensor>(TensorShape(3U), DataType::F32);
-    auto dst     = create_tensor<CLTensor>(TensorShape(13U, 13U, 3U), DataType::F32);
-
-    // Create kernel
-    cl::NDRange                    fake_lws(2000);
-    CLDirectConvolutionLayerKernel conv;
-    conv.set_target(GPUTarget::G72);
-
-    // Configure
-    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1));
-
-    // Hard-wire lws to kernel and validate lws
-    conv.set_lws_hint(fake_lws);
-    ARM_COMPUTE_EXPECT(conv.lws_hint()[0] == 2000, framework::LogLevel::ERRORS);
-
-    // Tune kernel and validate
-    tuner.tune_kernel_static(conv);
-    ARM_COMPUTE_EXPECT(conv.lws_hint()[0] != 2000, framework::LogLevel::ERRORS);
-
-    // Clear tuner
-    CLScheduler::get().default_init();
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/CL/UNIT/WeightsRetention.cpp b/tests/validation/CL/UNIT/WeightsRetention.cpp
index acf795e48b..357c88af10 100644
--- a/tests/validation/CL/UNIT/WeightsRetention.cpp
+++ b/tests/validation/CL/UNIT/WeightsRetention.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,18 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
diff --git a/tests/validation/CL/WeightsReshape.cpp b/tests/validation/CL/WeightsReshape.cpp
index d04c10cee2..4345c4b08a 100644
--- a/tests/validation/CL/WeightsReshape.cpp
+++ b/tests/validation/CL/WeightsReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -41,7 +41,7 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(WeightsReshape)
 
-using CLWeightsReshape = CLSynthetizeFunction<CLWeightsReshapeKernel>;
+using ClWeightsReshape = ClSynthetizeOperatorWithBorder<opencl::kernels::ClWeightsReshapeKernel>;
 
 /** Validate tests
  *
@@ -87,15 +87,15 @@ framework::dataset::make("NumGroups", { 1, 1, 1, 2, 1, 2 })),
 framework::dataset::make("Expected", { false, false, false, false, false, false })),
 input_info, biases_info, output_info, num_groups, expected)
 {
-    bool status = bool(CLWeightsReshape::validate(&input_info, &biases_info, &output_info, num_groups));
+    bool status = bool(opencl::kernels::ClWeightsReshapeKernel::validate(&input_info, &biases_info, &output_info, num_groups));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 
 template <typename T>
-using CLWeightsReshapeFixture = WeightsReshapeValidationFixture<CLTensor, CLAccessor, CLWeightsReshape, T>;
+using ClWeightsReshapeFixture = WeightsReshapeOpValidationFixture<CLTensor, CLAccessor, ClWeightsReshape, T>;
 
 TEST_SUITE(Float)
-FIXTURE_DATA_TEST_CASE(FP32, CLWeightsReshapeFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(3U, 3U, 48U, 120U) }),
+FIXTURE_DATA_TEST_CASE(FP32, ClWeightsReshapeFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(3U, 3U, 48U, 120U) }),
                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
                                                                                                           framework::dataset::make("HasBias", { true, false })),
                                                                                                   framework::dataset::make("NumGroups", { 1, 2 })))
@@ -104,7 +104,7 @@ FIXTURE_DATA_TEST_CASE(FP32, CLWeightsReshapeFixture<float>, framework::DatasetM
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(FP16, CLWeightsReshapeFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(13U, 13U, 96U, 240U) }),
+FIXTURE_DATA_TEST_CASE(FP16, ClWeightsReshapeFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(13U, 13U, 96U, 240U) }),
                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
                                                                                                          framework::dataset::make("HasBias", { true, false })),
                                                                                                  framework::dataset::make("NumGroups", { 3, 4 })))
@@ -113,7 +113,7 @@ FIXTURE_DATA_TEST_CASE(FP16, CLWeightsReshapeFixture<half>, framework::DatasetMo
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(BFloat16, CLWeightsReshapeFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(9U, 9U, 96U, 240U) }),
+FIXTURE_DATA_TEST_CASE(BFloat16, ClWeightsReshapeFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(9U, 9U, 96U, 240U) }),
                                                                                                                      framework::dataset::make("DataType", DataType::BFLOAT16)),
                                                                                                              framework::dataset::make("HasBias", { false })),
                                                                                                      framework::dataset::make("NumGroups", { 3, 4 })))
@@ -125,7 +125,7 @@ FIXTURE_DATA_TEST_CASE(BFloat16, CLWeightsReshapeFixture<half>, framework::Datas
 TEST_SUITE_END()
 
 TEST_SUITE(Quantized)
-FIXTURE_DATA_TEST_CASE(QASYMM8, CLWeightsReshapeFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(5U, 5U, 48U, 120U) }),
+FIXTURE_DATA_TEST_CASE(QASYMM8, ClWeightsReshapeFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(5U, 5U, 48U, 120U) }),
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                framework::dataset::make("HasBias", { false })),
                                                                                                        framework::dataset::make("NumGroups", { 1, 2 })))
@@ -134,7 +134,7 @@ FIXTURE_DATA_TEST_CASE(QASYMM8, CLWeightsReshapeFixture<uint8_t>, framework::Dat
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(QASYMM8_SIGNED, CLWeightsReshapeFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(5U, 5U, 48U, 120U) }),
+FIXTURE_DATA_TEST_CASE(QASYMM8_SIGNED, ClWeightsReshapeFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("InputShape", { TensorShape(5U, 5U, 48U, 120U) }),
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                       framework::dataset::make("HasBias", { false })),
                                                                                                               framework::dataset::make("NumGroups", { 1, 2 })))
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 115f9378c9..196e7edb8c 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,12 +27,10 @@
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
-#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
+#include "tests/datasets/ActivationFunctionsDataset.h"
 #include "tests/datasets/LargeConvolutionLayerDataset.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/datasets/SmallConvolutionLayerDataset.h"
@@ -50,631 +48,379 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 // *INDENT-OFF*
 // clang-format off
-constexpr AbsoluteTolerance<float> tolerance_f32(0.002f);
 const AbsoluteTolerance<half> tolerance_f16(half(1.f));
 constexpr AbsoluteTolerance<float> tolerance_convolution_layer_f32(0.1f);
 const AbsoluteTolerance<half> tolerance_convolution_layer_f16(half(0.4f));
 RelativeTolerance<half_float::half> rel_tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for FP16 data types */
 constexpr float                     tolerance_num   = 0.05f;  /**< Tolerance number */
 constexpr float                     abs_tolerance_convolution_layer_f16   = 2.5f;  /**< Tolerance number */
-constexpr float                      tolerance_num_f16 = 0.15f;                 /**< Tolerance number */
-
-// Input transform
-const auto SmallWinogradInputTransformDatasetNCHW =
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset2x2_3x3(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset2x1_3x1(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x2_1x3(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x1_3x1(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x4_1x3(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_5x5(),
-           framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x1_5x1(),
-                                      datasets::SmallWinogradInputTransformDataset1x4_1x5()))))))));
-
-const auto SmallWinogradInputTransformDatasetNHWC = framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(),
-                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x1_3x1(),
-                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x4_1x3(),
-                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_5x5(),
-                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x1_5x1(),
-                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x4_1x5(),
-                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset2x1_7x1(),
-                                                                               datasets::SmallWinogradInputTransformDataset1x2_1x7())))))));
-
-const auto SmallWinogradInputTransformDatasetNHWC_FP32 = framework::dataset::concat(SmallWinogradInputTransformDatasetNHWC,
-                                                                                    datasets::SmallWinogradInputTransformDataset2x2_7x7());
-
-const auto LargeWinogradInputTransformDatasetNCHW =
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset2x2_3x3(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset2x1_3x1(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset1x2_1x3(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_3x3(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x1_3x1(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset1x4_1x3(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_5x5(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x1_5x1(),
-                                               datasets::LargeWinogradInputTransformDataset1x4_1x5()))))))));
-
-const auto LargeWinogradInputTransformDatasetNHWC =
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_3x3(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x4_5x5(),
-           framework::dataset::concat(datasets::LargeWinogradInputTransformDataset4x1_5x1(),
-                                      datasets::LargeWinogradInputTransformDataset1x4_1x5())));
-
-const auto LargeWinogradInputTransformDatasetNHWC_FP32 =
-           framework::dataset::concat(LargeWinogradInputTransformDatasetNHWC,
-                                      (datasets::LargeWinogradInputTransformDataset2x2_7x7()));
-
-// Filter transform
-const auto SmallWinogradFilterTransformDatasetNCHW =
-           framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 1U), Size2D(4U, 1U) })),
-           framework::dataset::concat(combine(datasets::Small1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 2U), Size2D(1U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small5x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
-                                      combine(datasets::Small1x5Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })))))));
-
-const auto SmallWinogradFilterTransformDatasetNHWC_F16 =
-           framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
-           framework::dataset::concat(combine(datasets::Small1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small5x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
-           framework::dataset::concat(combine(datasets::Small1x5Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })),
-           framework::dataset::concat(combine(datasets::Small1x7Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 2U) })),
-                                      combine(datasets::Small7x1Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 1U) })))))))));
-
-const auto SmallWinogradFilterTransformDatasetNHWC_F32 =
-           framework::dataset::concat(SmallWinogradFilterTransformDatasetNHWC_F16,
-                                      combine(datasets::Small7x7Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U) })));
-
-const auto LargeWinogradFilterTransformDatasetNCHW =
-           framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 1U), Size2D(4U, 1U) })),
-           framework::dataset::concat(combine(datasets::Large1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 2U), Size2D(1U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large5x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
-                                      combine(datasets::Large1x5Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })))))));
-
-const auto LargeWinogradFilterTransformDatasetNHWC_F16 =
-           framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
-           framework::dataset::concat(combine(datasets::Large1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large5x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
-           framework::dataset::concat(combine(datasets::Large1x5Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })),
-           framework::dataset::concat(combine(datasets::Large7x1Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 1U) })),
-                                      combine(datasets::Large1x7Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 2U) })))))))));
-
-const auto LargeWinogradFilterTransformDatasetNHWC_F32 =
-           framework::dataset::concat(LargeWinogradFilterTransformDatasetNHWC_F16,
-                                      combine(datasets::Large7x7Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U) })));
-
-// Output transform
-const auto SmallWinogradOutputTransformDatasetNCHW = datasets::SmallWinogradOutputTransformDatasetNCHW();
-
-const auto SmallWinogradOutputTransformDatasetNHWC_F16 = datasets::SmallWinogradOutputTransformDatasetNHWC_F16();
-
-const auto SmallWinogradOutputTransformDatasetNHWC_F32 = datasets::SmallWinogradOutputTransformDatasetNHWC_F32();
-
-const auto LargeWinogradOutputTransformDatasetNCHW = datasets::LargeWinogradOutputTransformDatasetNCHW();
-
-const auto LargeWinogradOutputTransformDatasetNHWC_F16 = datasets::LargeWinogradOutputTransformDatasetNHWC_F16();
-
-const auto LargeWinogradOutputTransformDatasetNHWC_F32 = datasets::LargeWinogradOutputTransformDatasetNHWC_F32();
-
-//Activation Functions
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+constexpr float                     tolerance_num_f16 = 0.15f;                 /**< Tolerance number */
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
-    ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.8f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU)
 });
-const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
+
+const auto ActivationFunctionsSmallDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.8f, -0.5f)
 });
 
 } // namespace
 
 using namespace arm_compute::misc::shape_calculator;
 
+/*
+    Testing Strategy of CL Winograd:
+        - For NCHW and NHWC, and for each kernel size, we have a dedicated OpenCL kernel
+          (except that 1xN and Nx1 use NxN under the hood). Therefore, test cases should
+          stress each of these configurations.
+        - Fp32 and Fp16 kernels are the same. Only the DATA_TYPE build option changes
+          between these two. Since the same kernel is already stressed thoroughly with both
+          small and large shapes for the Fp32 data type, Fp16 kernels are run on a subset
+          of the shapes: exhaustively testing the same kernel again gives diminishing
+          returns.
+        - Activations only affect the output stage and are calculated on the output tile.
+          Exhaustively testing all activations with all the shapes does not provide much
+          value but increases the testing time quite significantly. Therefore, all activations
+          are tested on a subset of the shapes, and for all MxM kernels and data layouts as
+          they represent different OpenCL kernels. (1xM and Mx1 kernels use MxM under the hood).
+*/
 TEST_SUITE(CL)
 TEST_SUITE(Winograd)
 
-TEST_SUITE(InputTransform)
-
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                framework::dataset::make("InputInfo",{
-                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F16),     // F16 not supported
-                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
-                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
-                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Strides not supported
-                                                                                        TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32),         // Padding needed
-                                                                                        TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32),     // Padding needed
-                                                                                        TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32)         // Padding needed
-                                                                                    }),
-                                                framework::dataset::make("OutputInfo", {
-                                                                                        TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::QASYMM8),
-                                                                                        TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(5U, 1U, 16U, 3U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 442U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(7U, 320U, 16U, 3U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(37U, 304U, 16U), 1, DataType::F32)
-                                                                                    })),
-                                                framework::dataset::make("WinogradInfo", {
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(2, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(34U, 42U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(31U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)
-                                                                                    })),
-                                                framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
-                                            input_info, output_info, winograd_info, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
-}
-
-using CLWinogradInputTransformFixtureFP32 = WinogradInputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradInputTransform, float>;
-using CLWinogradInputTransformFixtureFP16 = WinogradInputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradInputTransform, half>;
-
-TEST_SUITE(NCHW)
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNCHW,
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                                     framework::dataset::make("DataType", { DataType::F32 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNCHW,
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                                   framework::dataset::make("DataType", { DataType::F32 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNCHW,
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                                     framework::dataset::make("DataType", { DataType::F16 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNCHW,
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                                   framework::dataset::make("DataType", { DataType::F16 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-TEST_SUITE_END() // FP16
-TEST_SUITE_END() // NCHW
-
-TEST_SUITE(NHWC)
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNHWC,
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                     framework::dataset::make("DataType", { DataType::F16 })))
+TEST_SUITE(ConvolutionLayer)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( // row i of every dataset below forms one validate() configuration
+    make("InputInfo", {
+        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // Insufficient padding
+        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),     // Datatype mismatch
+        TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
+        TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32),     // Padding needed
+        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)  // Kernel size not supported
+        }),
+    make("WeightsInfo", {
+        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F16),
+        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
+        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
+        TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
+        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
+        }),
+    make("BiasesInfo", {
+        TensorInfo(TensorShape(19U), 1, DataType::F16),
+        TensorInfo(TensorShape(19U), 1, DataType::F32),
+        TensorInfo(TensorShape(21U), 1, DataType::F32),
+        TensorInfo(TensorShape(16U), 1, DataType::F32),
+        TensorInfo(TensorShape(16U), 1, DataType::F32)
+        }),
+    make("OutputInfo", {
+        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F16),
+        TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
+        TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
+        TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
+        }),
+    make("ConvInfo", {
+        PadStrideInfo(1, 1, 1, 1),
+        PadStrideInfo(1, 1, 1, 1),
+        PadStrideInfo(1, 2, 0, 0),
+        PadStrideInfo(1, 1, 1, 1),
+        PadStrideInfo(1, 1, 1, 0)
+    }),
+    make("Expected", { false, false, false, false, false })), // every row is expected to fail validation
+    input_info, weights_info, bias_info, output_info, conv_info, expected)
 {
-    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num_f16);
+    ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP16, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNHWC,
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("DataType", { DataType::F16 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num_f16);
-}
-TEST_SUITE_END() // FP16
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::PRECOMMIT, combine(combine(SmallWinogradInputTransformDatasetNHWC_FP32,
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                     framework::dataset::make("DataType", { DataType::F32 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
+DATA_TEST_CASE(SupportedKernels, framework::DatasetMode::ALL, zip(
+    make("WeightsInfo", {
+        // Shapes are always given in NCHW format. When the layout is NHWC, the test body permutes the shape
+
+        // Fp32/16, NCHW
+        // 3x3, 1x3, 3x1 --> all TRUE
+        TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+        // 5x5, 1x5, 5x1 --> all TRUE
+        TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+        TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+        // 7x7, 1x7, 7x1
+        // nchw does not support kernels with size 7 --> all FALSE
+        TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+        // unsupported kernel sizes --> all FALSE
+        TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+        TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+        // Fp32/16, NHWC
+        // 7x7, 1x7, 7x1 --> all TRUE
+        TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+        TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+        TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+
+        // 3x3, 1x3, 3x1 --> all TRUE
+        TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+        TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+        TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+
+        // 5x5, 1x5, 5x1 --> all TRUE
+        TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+        TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+        TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+
+        // unsupported kernel sizes --> all FALSE
+        TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+        TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+        TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+
+        }),
+    make("Expected", {
+        true, true, true,     // nchw, 3x3, 1x3, 3x1
+        true, true, true,     // nchw, 5x5, 1x5, 5x1
+        false, false, false,  // nchw, 7x7, 1x7, 7x1
+        false, false, false,  // nchw, random unsupported kernels
+        true, true, true,     // nhwc, 7x7, 1x7, 7x1
+        true, true, true,     // nhwc, 3x3, 1x3, 3x1
+        true, true, true,     // nhwc, 5x5, 1x5, 5x1
+        false, false, false,  // nhwc, random unsupported kernels
+    })),
+    weights_info_const, expected)
+{
+    DataType data_type = weights_info_const.data_type();
+    DataLayout data_layout = weights_info_const.data_layout();
+
+    TensorInfo input_info = TensorInfo(TensorShape(17U, 31U, 2U), 1, data_type); // input shape is fixed; only the weights vary per row
+    TensorInfo bias_info = TensorInfo(TensorShape(8U), 1, data_type);
+    TensorInfo weights_info = weights_info_const;
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        // Convert the NCHW-ordered input and weights shapes to NHWC
+        PermutationVector perm = PermutationVector(2U, 0U, 1U);
+
+        TensorShape input_shape = input_info.tensor_shape();
+        TensorShape weights_shape = weights_info.tensor_shape();
+        permute(input_shape, perm);
+        permute(weights_shape, perm);
+
+        input_info.set_tensor_shape(input_shape);
+        weights_info.set_tensor_shape(weights_shape);
+
+        input_info.set_data_layout(data_layout);
+        weights_info.set_data_layout(data_layout);
+        bias_info.set_data_layout(data_layout);
+    }
+
+    PadStrideInfo conv_info(1, 1, 0, 0); // NOTE(review): presumably unit stride and no padding - confirm against PadStrideInfo's constructor
+
+    TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, conv_info);
+    TensorInfo output_info = TensorInfo(output_shape, 1, data_type, data_layout);
+
+    Status status = CLWinogradConvolutionLayer::validate(
+        &input_info,
+        &weights_info,
+        &bias_info,
+        &output_info,
+        conv_info,
+        ActivationLayerInfo(),
+        true /* fast math */);
+
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixtureFP32, framework::DatasetMode::NIGHTLY, combine(combine(LargeWinogradInputTransformDatasetNHWC_FP32,
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("DataType", { DataType::F32 })))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-TEST_SUITE_END() // NHWC
-TEST_SUITE_END() // InputTransform
-
-TEST_SUITE(FilterTransform)
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                framework::dataset::make("InputInfo",{
-                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F16),     // F16 supported
-                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
-                                                                                        TensorInfo(TensorShape(5U, 5U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
-                                                                                        TensorInfo(TensorShape(3U, 3U), 1, DataType::F32),             // Output tile not supported
-                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F32),     // valid
-                                                                                        TensorInfo(TensorShape(3U, 3U, 37U, 2U), 1, DataType::F32),    // valid
-                                                                                        TensorInfo(TensorShape(3U, 3U, 37U, 22U), 1, DataType::F32)    // valid
-                                                                                    }),
-                                                framework::dataset::make("OutputInfo", {
-                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::QASYMM8),
-                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(1U, 1U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(2U, 37U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(22U, 37U, 36U), 1, DataType::F32)
-                                                                                    })),
-                                                framework::dataset::make("WinogradInfo", {
-                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
-                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
-                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
-                                                                                          WinogradInfo(Size2D(3U, 3U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
-                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
-                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
-                                                                                          WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ )
-                                                                                         })),
-                                                framework::dataset::make("Expected", { true, false, false, false, true, true, true })),
-                                            input_info, output_info, winograd_info, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(CLWinogradFilterTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
-}
-
-using CLWinogradFilterTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradFilterTransformKernel, 0>;
-using CLWinogradFilterTransformFixtureFP32 = WinogradFilterTransformValidationFixture<CLTensor, CLAccessor, CLWinogradFilterTransform, float>;
-using CLWinogradFilterTransformFixtureFP16 = WinogradFilterTransformValidationFixture<CLTensor, CLAccessor, CLWinogradFilterTransform, half>;
-
-TEST_SUITE(NCHW)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(SmallWinogradFilterTransformDatasetNCHW,
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("DataType", { DataType::F32 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradFilterTransformDatasetNCHW,
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("DataType", { DataType::F32 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(SmallWinogradFilterTransformDatasetNCHW,
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("DataType", { DataType::F16 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradFilterTransformDatasetNCHW,
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("DataType", { DataType::F16 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-TEST_SUITE_END() // FP16
-TEST_SUITE_END() // NCHW
-
-TEST_SUITE(NHWC)
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(SmallWinogradFilterTransformDatasetNHWC_F16,
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                       framework::dataset::make("DataType", { DataType::F16 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num_f16);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradFilterTransformDatasetNHWC_F16,
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                       framework::dataset::make("DataType", { DataType::F16 })))
+using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>; // fast-math validation fixture instantiated for CLWinogradConvolutionLayer with float
+using CLWinogradConvolutionLayerFastMathMixedDataLayoutFixture = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float, float, true, true>; // NOTE(review): presumably one of the trailing flags enables mixed data layout - confirm against the fixture declaration
+TEST_SUITE(Conv3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                               make("DataType", { DataType::F32 }),
+                                               ActivationFunctionsSmallDataset,
+                                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) // both data layouts are part of the combined dataset
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num_f16);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
-TEST_SUITE_END() // FP16
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(SmallWinogradFilterTransformDatasetNHWC_F32,
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                       framework::dataset::make("DataType", { DataType::F32 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }), // only a default-constructed ActivationLayerInfo is used here
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradFilterTransformDatasetNHWC_F32,
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                       framework::dataset::make("DataType", { DataType::F32 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-TEST_SUITE_END() // NHWC
-TEST_SUITE_END() // FilterTransform
-
-TEST_SUITE(OutputTransform)
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-                                                framework::dataset::make("InputInfo",{
-                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F16),      // F16 supported
-                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::QASYMM8),  // QASYMM8 not supported
-                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32),      // Kernel size not supported
-                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32),      // Valid
-                                                                                        TensorInfo(TensorShape(13U, 108U, 16U, 4U), 1, DataType::F32),      // Padding needed
-                                                                                        TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32),        // Valid
-                                                                                        TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32),        // Wrong WinogradInfo
-                                                                                        TensorInfo(TensorShape(7U, 256U, 36U, 3U), 1, DataType::F32),       // Valid
-                                                                                        TensorInfo(TensorShape(7U, 256U, 16U, 3U), 1, DataType::F32)        // Wrong number of batches
-                                                                                    }),
-                                                framework::dataset::make("BiasInfo", {
-                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(512U), 1, DataType::QASYMM8),
-                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(13U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32)
-                                                                                    })),
-                                                framework::dataset::make("OutputInfo", {
-                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::QASYMM8),
-                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(17U, 23U, 13U, 4U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(8U, 10U, 7U, 7U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(7U, 9U, 7U, 7U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(64U, 64U, 7U, 3U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(64U, 64U, 7U, 3U), 1, DataType::F32)
-                                                                                    })),
-                                                framework::dataset::make("WinogradInfo", {
-                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(5U, 5U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(17U, 23U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(2U, 3U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
-                                                                                        WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)
-                                                                                    })),
-                                                framework::dataset::make("Expected", { true, false, false, true, false, true, false, true, false })),
-                                            input_info, bias_info, output_info, winograd_info, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
-}
-
-using CLWinogradOutputTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>;
-using CLWinogradOutputTransformFixtureFP32 = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>;
-using CLWinogradOutputTransformFixtureFP16 = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, half>;
-
-TEST_SUITE(NCHW)
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::ALL,
-                       combine(combine(SmallWinogradOutputTransformDatasetNCHW,
-                               framework::dataset::make("DataType", { DataType::F16 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
+FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(
+                            make("Input", TensorShape(8U, 8U, 32U)), // shapes, pad/stride and dilation are single-value datasets
+                            make("Weight", TensorShape(3U, 3U, 32U, 4U)),
+                            make("Bias", TensorShape(4U)),
+                            make("Output", TensorShape(6U, 6U, 4U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsDataset, // NOTE(review): defined outside this hunk; presumably the set of activations this case iterates over - confirm
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
+TEST_SUITE_END() // Conv3x3
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradOutputTransformDatasetNCHW,
-                               framework::dataset::make("DataType", { DataType::F16 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-TEST_SUITE_END() // FP16
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::ALL,
-                       combine(combine(SmallWinogradOutputTransformDatasetNCHW,
-                               framework::dataset::make("DataType", { DataType::F32 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
+TEST_SUITE(Conv3x1)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) // both data layouts are part of the combined dataset
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradOutputTransformDatasetNCHW,
-                               framework::dataset::make("DataType", { DataType::F32 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }), // only a default-constructed ActivationLayerInfo is used here
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
-TEST_SUITE_END() // FP32
-TEST_SUITE_END() // NCHW
+TEST_SUITE_END() // Conv3x1
 
-TEST_SUITE(NHWC)
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::ALL,
-                       combine(combine(SmallWinogradOutputTransformDatasetNHWC_F16,
-                               framework::dataset::make("DataType", { DataType::F16 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
+TEST_SUITE(Conv1x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) // both data layouts are part of the combined dataset
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num_f16);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradOutputTransformDatasetNHWC_F16,
-                               framework::dataset::make("DataType", { DataType::F16 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLWinogradConvolutionLayerFastMathMixedDataLayoutFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(
+                            make("Input", TensorShape(8U, 8U, 32U)), // shapes, pad/stride and dilation are single-value datasets
+                            make("Weight", TensorShape(1U, 3U, 32U, 1U)),
+                            make("Bias", TensorShape(1U)),
+                            make("Output", TensorShape(8U, 6U, 1U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num_f16);
-}
-TEST_SUITE_END() // FP16
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::ALL,
-                       combine(combine(SmallWinogradOutputTransformDatasetNHWC_F32,
-                               framework::dataset::make("DataType", { DataType::F32 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
-                       combine(combine(LargeWinogradOutputTransformDatasetNHWC_F32,
-                               framework::dataset::make("DataType", { DataType::F32 })),
-                               framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-TEST_SUITE_END() // NHWC
-TEST_SUITE_END() // OutputTransform
-
-TEST_SUITE(ConvolutionLayer)
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-                                                framework::dataset::make("InputInfo", {
-                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // Insufficient padding
-                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),     // Datatype mismatch
-                                                                                        TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
-                                                                                        TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32),     // Padding needed
-                                                                                        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)  // Kernel size not supported
-                                                                                      }),
-                                                framework::dataset::make("WeightsInfo", {
-                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
-                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
-                                                                                        })),
-                                                framework::dataset::make("BiasesInfo", {
-                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(21U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32)
-                                                                                       })),
-                                                framework::dataset::make("OutputInfo", {
-                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F16),
-                                                                                        TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
-                                                                                       })),
-                                                framework::dataset::make("ConvInfo", {
-                                                                                        PadStrideInfo(1, 1, 1, 1),
-                                                                                        PadStrideInfo(1, 1, 1, 1),
-                                                                                        PadStrideInfo(1, 2, 0, 0),
-                                                                                        PadStrideInfo(1, 1, 1, 1),
-                                                                                        PadStrideInfo(1, 1, 1, 0)
-                                                                                                                 })),
-                                                framework::dataset::make("Expected", { false, false, false, false, false })),
-               input_info, weights_info, bias_info, output_info, conv_info, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS);
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
+TEST_SUITE_END() // Conv1x3
 
-TEST_SUITE(FP32)
-using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>;
-TEST_SUITE(Conv3x3)
+TEST_SUITE(Conv5x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
-}
-TEST_SUITE_END() // Conv3x3
+                       combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
-TEST_SUITE(Conv3x1)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsSmallDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(
+                            make("Input", TensorShape(13U, 13U, 32U)),
+                            make("Weight", TensorShape(5U, 5U, 32U, 4U)),
+                            make("Bias", TensorShape(4U)),
+                            make("Output", TensorShape(9U, 9U, 4U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
-TEST_SUITE_END() // Conv3x1
+TEST_SUITE_END() // Conv5x5
 
-TEST_SUITE(Conv1x3)
+TEST_SUITE(Conv5x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsSmallDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
-TEST_SUITE_END() // Conv1x3
+TEST_SUITE_END() // Conv5x1
 
-TEST_SUITE(Conv5x5)
+TEST_SUITE(Conv1x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsSmallDataset ),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -682,64 +428,63 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, fram
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsDataset ),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
-TEST_SUITE_END() // Conv5x5
+TEST_SUITE_END() // Conv1x5
 
-TEST_SUITE(Conv5x1)
+TEST_SUITE(Conv1x7)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
-
+FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(
+                            make("Input", TensorShape(13U, 13U, 32U)),
+                            make("Weight", TensorShape(1U, 7U, 32U, 4U)),
+                            make("Bias", TensorShape(4U)),
+                            make("Output", TensorShape(13U, 11U, 4U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 2)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsDataset,
+                            make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
-TEST_SUITE_END() // Conv5x1
+TEST_SUITE_END() // Conv1x7
 
-TEST_SUITE(Conv1x5)
+TEST_SUITE(Conv7x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
+                            make("DataType", { DataType::F32 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
 }
+TEST_SUITE_END() // Conv7x1
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+/** @note: Although the kernels support 7x7, the reference implementation
+ *  does not, so 7x7 remains a known "test gap".
+ */
 
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
-}
-TEST_SUITE_END() // Conv1x5
 TEST_SUITE_END() // FP32
 
 
@@ -748,20 +493,36 @@ TEST_SUITE(FP16)
 using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, half, float>;
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                               ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(
+                            make("Input", TensorShape(8U, 8U, 32U)),
+                            make("Weight", TensorShape(3U, 3U, 32U, 6U)),
+                            make("Bias", TensorShape(6U)),
+                            make("Output", TensorShape(6U, 6U, 6U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
@@ -770,20 +531,20 @@ TEST_SUITE_END() // Conv3x3
 
 TEST_SUITE(Conv3x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsSmallDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x1DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
@@ -792,20 +553,20 @@ TEST_SUITE_END() // Conv3x1
 
 TEST_SUITE(Conv1x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsSmallDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
-                                       framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x3DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
@@ -814,10 +575,10 @@ TEST_SUITE_END() // Conv1x3
 
 TEST_SUITE(Conv5x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -825,11 +586,27 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x5DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
+}
 
+FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(
+                            make("Input", TensorShape(13U, 13U, 32U)),
+                            make("Weight", TensorShape(5U, 5U, 32U, 6U)),
+                            make("Bias", TensorShape(6U)),
+                            make("Output", TensorShape(9U, 9U, 6U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
@@ -838,10 +615,10 @@ TEST_SUITE_END() // Conv5x5
 
 TEST_SUITE(Conv5x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -849,10 +626,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x1DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -862,10 +639,10 @@ TEST_SUITE_END() // Conv5x1
 
 TEST_SUITE(Conv1x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -873,10 +650,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x5DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -886,10 +663,10 @@ TEST_SUITE_END() // Conv1x5
 
 TEST_SUITE(Conv1x7)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsSmallDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -897,19 +674,47 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x7Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                               ActivationFunctionsDataset),
-                                               framework::dataset::make("DataLayout", { DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x7DatasetFp16Subset(),
+                            make("DataType", { DataType::F16 }),
+                            make("ActivationInfo", { ActivationLayerInfo() }),
+                            make("DataLayout", { DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
+}
 
+FIXTURE_DATA_TEST_CASE(RunActivations, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
+                       combine(
+                            make("Input", TensorShape(13U, 13U, 32U)),
+                            make("Weight", TensorShape(1U, 7U, 32U, 6U)),
+                            make("Bias", TensorShape(6U)),
+                            make("Output", TensorShape(13U, 7U, 6U)),
+                            make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                            make("Dilation", Size2D(1U, 1U)),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsDataset,
+                            make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_convolution_layer_f16);
 }
 TEST_SUITE_END() // Conv1x7
 
-TEST_SUITE_END() // FP16
+TEST_SUITE(Conv7x1)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
+                            make("DataType", { DataType::F16 }),
+                            ActivationFunctionsSmallDataset,
+                            make("DataLayout", { DataLayout::NHWC })))
+
+{
+    // Validate output. NOTE(review): the Conv1x7 cases pass rel_tolerance_f16 / tolerance_num / abs_tolerance_convolution_layer_f16 instead - confirm this tolerance pair is intended.
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f16, tolerance_num_f16);
+}
+TEST_SUITE_END() // Conv7x1
 
+TEST_SUITE_END() // FP16
 TEST_SUITE_END() // ConvolutionLayer
 TEST_SUITE_END() // Winograd
 TEST_SUITE_END() // CL
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
new file mode 100644
index 0000000000..448e96c4f9
--- /dev/null
+++ b/tests/validation/CMakeLists.txt
@@ -0,0 +1,146 @@
+# Copyright (c) 2023 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+target_sources(
+  arm_compute_validation
+  PRIVATE UNIT/SafeIntegerOps.cpp
+          UNIT/Version.cpp
+          UNIT/TensorInfo.cpp
+          UNIT/TensorShape.cpp
+          UNIT/Utils.cpp
+          UNIT/SubTensorInfo.cpp
+          UNIT/WindowIterator.cpp
+          UNIT/LifetimeManager.cpp
+          UNIT/GPUTarget.cpp
+          CPP/DetectionPostProcessLayer.cpp
+          CPP/TopKV.cpp
+          CPP/DFT.cpp
+          CPP/Permute.cpp
+          CPP/NonMaximumSuppression.cpp)
+
+if(ENABLE_NEON)
+  target_sources(
+    arm_compute_validation
+    PRIVATE NEON/ElementwiseNegation.cpp
+            NEON/BoundingBoxTransform.cpp
+            NEON/ChannelShuffle.cpp
+            NEON/Logical.cpp
+            NEON/DilatedConvolutionLayer.cpp
+            NEON/PoolingLayer.cpp
+            NEON/BitwiseNot.cpp
+            NEON/FillBorder.cpp
+            NEON/ElementwiseRsqrtLayer.cpp
+            NEON/DepthConcatenateLayer.cpp
+            NEON/ElementwisePower.cpp
+            NEON/Fill.cpp
+            NEON/ROIPoolingLayer.cpp
+            NEON/LSTMLayer.cpp
+            NEON/ArithmeticSubtraction.cpp
+            NEON/GEMMLowp.cpp
+            NEON/Unstack.cpp
+            NEON/Slice.cpp
+            NEON/Pooling3dLayer.cpp
+            NEON/BitwiseOr.cpp
+            NEON/HeightConcatenateLayer.cpp
+            NEON/ReshapeLayer.cpp
+            NEON/SoftmaxLayer.cpp
+            NEON/Gather.cpp
+            NEON/CropResize.cpp
+            NEON/ReductionOperation.cpp
+            NEON/PixelWiseMultiplication.cpp
+            NEON/LogSoftmaxLayer.cpp
+            NEON/DepthConvertLayer.cpp
+            NEON/Flatten.cpp
+            NEON/ElementwiseKernelSelection.cpp
+            NEON/DepthToSpaceLayer.cpp
+            NEON/ElementwiseAbsoluteValue.cpp
+            NEON/PadLayer.cpp
+            NEON/MeanStdDevNormalizationLayer.cpp
+            NEON/GlobalPoolingLayer.cpp
+            NEON/RNNLayer.cpp
+            NEON/DetectionPostProcessLayer.cpp
+            NEON/ElementwiseRound.cpp
+            NEON/BitwiseXor.cpp
+            NEON/GEMM.cpp
+            NEON/FuseBatchNormalization.cpp
+            NEON/BitwiseAnd.cpp
+            NEON/ElementwiseMax.cpp
+            NEON/ReduceMean.cpp
+            NEON/Reverse.cpp
+            NEON/L2NormalizeLayer.cpp
+            NEON/Convolution3D.cpp
+            NEON/ArithmeticAddition.cpp
+            NEON/ActivationLayer.cpp
+            NEON/SpaceToBatchLayer.cpp
+            NEON/ElementwiseLog.cpp
+            NEON/LSTMLayerQuantized.cpp
+            NEON/Im2Col.cpp
+            NEON/DequantizationLayer.cpp
+            NEON/DeconvolutionLayer.cpp
+            NEON/Select.cpp
+            NEON/ElementwiseSin.cpp
+            NEON/PReluLayer.cpp
+            NEON/BatchNormalizationLayer.cpp
+            NEON/ElementwiseMin.cpp
+            NEON/InstanceNormalizationLayer.cpp
+            NEON/ROIAlignLayer.cpp
+            NEON/ElementwiseDivision.cpp
+            NEON/ElementwiseExpLayer.cpp
+            NEON/ArgMinMax.cpp
+            NEON/QLSTMLayerNormalization.cpp
+            NEON/Col2Im.cpp
+            NEON/Split.cpp
+            NEON/Transpose.cpp
+            NEON/GenerateProposalsLayer.cpp
+            NEON/StackLayer.cpp
+            NEON/WidthConcatenateLayer.cpp
+            NEON/NormalizationLayer.cpp
+            NEON/Copy.cpp
+            NEON/ElementwiseSquareDiff.cpp
+            NEON/MaxUnpoolingLayer.cpp
+            NEON/Permute.cpp
+            NEON/Comparisons.cpp
+            NEON/BatchConcatenateLayer.cpp
+            NEON/Tile.cpp
+            NEON/BatchToSpaceLayer.cpp
+            NEON/SpaceToDepthLayer.cpp
+            NEON/DepthwiseConvolutionLayerNative.cpp
+            NEON/QuantizationLayer.cpp
+            NEON/ConvertFullyConnectedWeights.cpp
+            NEON/Floor.cpp
+            NEON/FFT.cpp
+            NEON/Cast.cpp
+            NEON/PriorBoxLayer.cpp
+            NEON/Scale.cpp
+            NEON/ReorgLayer.cpp
+            NEON/Range.cpp
+            NEON/DirectConvolutionLayer.cpp
+            NEON/DepthwiseConvolutionLayer.cpp
+            NEON/FullyConnectedLayer.cpp
+            NEON/ConvolutionLayer.cpp
+            NEON/StridedSlice.cpp
+            NEON/ReorderLayer.cpp
+            NEON/UNIT/DynamicTensor.cpp
+            NEON/UNIT/TensorAllocator.cpp
+            NEON/UNIT/MemoryManager.cpp
+            NEON/UNIT/RuntimeContext.cpp)
+endif()
diff --git a/tests/validation/CPP/DFT.cpp b/tests/validation/CPP/DFT.cpp
index e19e850589..84431399be 100644
--- a/tests/validation/CPP/DFT.cpp
+++ b/tests/validation/CPP/DFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -125,7 +125,7 @@ DATA_TEST_CASE(Real, framework::DatasetMode::ALL, shapes_2d_dft,
     auto backward = reference::ridft_2d(forward, is_odd);
 
     // Validate with input
-    validate(SimpleTensorAccessor<float>(src), backward, RelativeTolerance<float>(0.1f));
+    validate(SimpleTensorAccessor<float>(src), backward, RelativeTolerance<float>(0.1f), 0.f, AbsoluteTolerance<float>(0.001f));
 }
 
 DATA_TEST_CASE(Complex, framework::DatasetMode::ALL, shapes_2d_dft,
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index eb8bdcf5a7..560460fd33 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "tests/validation/Helpers.h"
+#include "tests/framework/Asserts.h"
 
 #include <algorithm>
 #include <cmath>
+#include <cstdint>
+#include <tuple>
 
 namespace arm_compute
 {
@@ -32,82 +35,6 @@ namespace test
 {
 namespace validation
 {
-void fill_mask_from_pattern(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
-{
-    unsigned int                v = 0;
-    std::mt19937                gen(library->seed());
-    std::bernoulli_distribution dist(0.5);
-
-    for(int r = 0; r < rows; ++r)
-    {
-        for(int c = 0; c < cols; ++c, ++v)
-        {
-            uint8_t val = 0;
-
-            switch(pattern)
-            {
-                case MatrixPattern::BOX:
-                    val = 255;
-                    break;
-                case MatrixPattern::CROSS:
-                    val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
-                    break;
-                case MatrixPattern::DISK:
-                    val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
-                            (cols / 2.0f))) <= 1.0f ? 255 : 0;
-                    break;
-                case MatrixPattern::OTHER:
-                    val = (dist(gen) ? 0 : 255);
-                    break;
-                default:
-                    return;
-            }
-
-            mask[v] = val;
-        }
-    }
-
-    if(pattern == MatrixPattern::OTHER)
-    {
-        std::uniform_int_distribution<uint8_t> distribution_u8(0, ((cols * rows) - 1));
-        mask[distribution_u8(gen)] = 255;
-    }
-}
-
-HarrisCornersParameters harris_corners_parameters()
-{
-    HarrisCornersParameters params;
-
-    std::mt19937                           gen(library->seed());
-    std::uniform_real_distribution<float>  threshold_dist(0.f, 0.001f);
-    std::uniform_real_distribution<float>  sensitivity(0.04f, 0.15f);
-    std::uniform_real_distribution<float>  euclidean_distance(0.f, 30.f);
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-
-    params.threshold             = threshold_dist(gen);
-    params.sensitivity           = sensitivity(gen);
-    params.min_dist              = euclidean_distance(gen);
-    params.constant_border_value = int_dist(gen);
-
-    return params;
-}
-
-CannyEdgeParameters canny_edge_parameters()
-{
-    CannyEdgeParameters params;
-
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    std::uniform_int_distribution<uint8_t> threshold_dist(2, 255);
-
-    params.constant_border_value = int_dist(gen);
-    params.upper_thresh          = threshold_dist(gen); // upper_threshold >= 2
-    threshold_dist               = std::uniform_int_distribution<uint8_t>(1, params.upper_thresh - 1);
-    params.lower_thresh          = threshold_dist(gen); // lower_threshold >= 1 && lower_threshold < upper_threshold
-
-    return params;
-}
-
 template <>
 SimpleTensor<float> convert_from_asymmetric(const SimpleTensor<uint8_t> &src)
 {
@@ -401,6 +328,249 @@ std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo
     return std::pair<int, int> { min_bound, max_bound };
 }
 
+void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout, bool only_right_pad)
+{ // Extends the padding of every tensor in `tensors` by a pseudo-random amount; does nothing unless the layout is NHWC.
+    if(data_layout == DataLayout::NHWC)
+    {
+        constexpr unsigned int lower = 1U;  // Smallest padding amount that can be drawn
+        constexpr unsigned int upper = 16U; // Largest padding amount that can be drawn
+        // One distribution is shared by all tensors; the generator itself is re-created per tensor in the loop.
+        std::uniform_int_distribution<unsigned int> distribution(lower, upper);
+        size_t                                      seed_offset = 0;
+
+        for(ITensor *tensor : tensors)
+        {
+            ARM_COMPUTE_ERROR_ON(!tensor->info()->is_resizable());
+            // Offset the library seed by the tensor's position in the list so each tensor gets its own generator state.
+            std::mt19937 gen(library->seed() + seed_offset++);
+            // `right` is always drawn first; `left` consumes a second draw only when left padding is requested.
+            const unsigned int right = distribution(gen);
+            const unsigned int left  = only_right_pad ? 0 : distribution(gen);
+            // Only the 2nd and 4th PaddingSize arguments are non-zero (presumably right and left - confirm against PaddingSize).
+            tensor->info()->extend_padding(PaddingSize(0U, right, 0U, left));
+        }
+    }
+}
+
+QuantizationHint suggest_conv_dst_q_info_and_bias(const QuantizationInfo &in_q_info,
+                                                  const QuantizationInfo &weight_q_info,
+                                                  int32_t height,
+                                                  int32_t width,
+                                                  int32_t channels,
+                                                  DataType data_type,
+                                                  float bias_fraction)
+{
+    /**  Quantization Setup of convolution
+     *
+     *  Just like any other multiply-accumulate, the convolution (2D) operation
+     *  multiplies and accumulates the input and weight tensors. This operation
+     *  takes place in three dimensions: height, width and channels. All of them
+     *  are dimensions of the weight tensor.
+     *
+     *  The formula for simple convolution can be written as:
+     *      C = sum_h sum_w sum_c(I[h_offset + h, w_offset + w, c] * W[h, w, c])
+     *
+     *  Here, h_offset and w_offset are the starting positions in the image. Effects
+     *  of paddings are ignored. This accumulation reduces to something like
+     *
+     *  C = sum_m(I_index * W_hwc)
+     *      where m is height x width x channels.
+     *
+     *  Non-unit strides and/or dilations do not change the probabilistic nature of
+     *  this sum because we always iterate over the full extent of the weight tensor.
+     *
+     *  Paddings may affect this summation, but it's a boundary condition and so is
+     *  neglected for brevity.
+     */
+    // The accumulation depth handed to the generic MAC helper is the number of weight elements per output value.
+    return suggest_mac_dst_q_info_and_bias(in_q_info, weight_q_info, height * width * channels, data_type, bias_fraction);
+}
+
+QuantizationHint suggest_matmul_dst_q_info_and_bias(const QuantizationInfo &lhs_q_info,
+                                                    const QuantizationInfo &rhs_q_info,
+                                                    int32_t m, int32_t n, int32_t k, DataType data_type,
+                                                    float bias_fraction)
+{
+    ARM_COMPUTE_UNUSED(m, n); // Only the accumulation depth k is forwarded to the generic MAC helper below
+
+    /**  Quantization Setup of matrix multiplication
+     *
+     *  We have a matrix multiplication of the form C = A * B + D
+     *  where A is (m x k), B is (k x n) and C is therefore (m x n).
+     *  The bias, D, is (1 x n).
+     *
+     *  If we have some distributional statistics of A, B and D, i.e. mean and variance,
+     *  we can estimate the mean and variance of a single value in the C matrix and pick
+     *  good scale and offset values for the output and have non-saturated tests.
+     *
+     *  Each element in the output matrix can be calculated as follows:
+     *      C_ij = sum_k(A_ik * B_kj) + D_j
+     *
+     * Note: All possible A_ik, B_kj, D_j random variables are assumed mutually independent.
+     * Note: In quantized operators, bias is an integer. But, its quantization scale is
+     *       assumed to be equal to lhs_scale * rhs_scale, and offset equal to 0.
+     * Note: Since bias is an integer that should be given as input, we need to pick reasonable
+     *       values when adding it on top of the summation. This is where "bias_fraction" comes
+     *       into play. Based on the fraction given, we also return a suggested bias range (min/max)
+     *       for not saturating the output.
+     *
+     * Because all random variables are mutually independent, any C_ij has the same statistics,
+     * which is why we return a single destination quantization info object, and why we can
+     * resort to the more general calculation explained in suggest_mac_dst_q_info_and_bias().
+     *
+     * From a probabilistic perspective, the above calculation reduces to
+     *      c = sum_k (a_k * b_k) + d
+     */
+
+    return suggest_mac_dst_q_info_and_bias(lhs_q_info, rhs_q_info, k, data_type, bias_fraction);
+}
+
+// Suggests an output quantization (scale/offset) and an integer bias range for a K-deep quantized multiply-accumulate,
+// chosen so that roughly mean +/- num_sd standard deviations of the expected output fit in the 8-bit data type.
+QuantizationHint suggest_mac_dst_q_info_and_bias(
+    const QuantizationInfo &a_q_info, const QuantizationInfo &b_q_info, int32_t K, DataType data_type, float bias_fraction, int num_sd)
+{
+    ARM_COMPUTE_ASSERT(data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED);
+
+    const int32_t t_max = static_cast<int32_t>(data_type == DataType::QASYMM8 ? std::numeric_limits<uint8_t>::max() : std::numeric_limits<int8_t>::max());
+    const int32_t t_min = static_cast<int32_t>(data_type == DataType::QASYMM8 ? std::numeric_limits<uint8_t>::min() : std::numeric_limits<int8_t>::min());
+
+    /**  Quantization Setup of multiply-accumulate
+     *
+     * Expression (in float):
+     *    C = sum_k ( A_k * B_k ) + D
+     *
+     * Lemma: An affine transformation (i.e. aX + b) to a discrete uniform random variable
+     *        creates another discrete uniform random variable.
+     *
+     * Terminology:
+     *  E[X]: Mean of the random variable X (sometimes referred as mu_x)
+     *  var(X): Variance of the random variable X (sometimes referred as sigma^2_x)
+     *  std(X): sqrt(var(X)), standard deviation of X
+     *
+     * 1) Calculate the mean:
+     *      E[C] = sum_k( E[A_k] * E[B_k] ) + D = K * mean_a * mean_b + mean_d
+     *
+     *      Since elements of A and B are uniformly distributed random variables, we have
+     *          mean_a = (max_a + min_a) / 2, mean_b = (max_b + min_b ) / 2
+     *          max_a and min_a can be calculated with the scale_a/b and offset_a/b
+     *              by replacing data type minimum and maximums in the equations
+     *
+     *    We don't know mean_d because we have to choose it based on bias_fraction. If we call
+     *    the summation as M_int, similar to above, we have:
+     *
+     *      E[C_int] = sum_k( E[A_k_int] * E[B_k_int] ) + E[D_int] = K * mean_a_int * mean_b_int + mean_d_int
+     *                  \___________________________/
+     *                             E[M_int]
+     *
+     *      We choose a bias mean proportional to the integer summation. This proportion is "bias_fraction".
+     *      So, we have D_int = f * M_int (f: fraction), and
+     *          E[D_int] = mean_d_int = f * E[M_int]
+     *
+     *      This also means, for floating point value of D, the following:
+     *          E[D] = mean_d = E[D_int] * a_scale * b_scale
+     *
+     * 2) Calculate the variance:
+     *      var(C)    = sum_k( var(A_k * B_k) ) + var(D)
+     *                = sum_k ( E[A_k^2 * B_k^2] - E[A_k]^2 * E[B_k]^2 ) + var(D)
+     *                = ...
+     *                = K * (var_a * var_b + var_a * mean^2_b + var_b * mean^2_a) + var_d
+     *
+     *      Similarly, due to uniform random variable properties, we have
+     *          var_a = (max_a - min_a)^2 / 12
+     *          var_b = (max_b - min_b)^2 / 12
+     *
+     *      Again, we don't know var_d as we don't know the bias. As set out in the previous section, we have
+     *              var(D_int) = var(f * M_int) = f^2 * var(M_int)
+     *
+     *      Using the same expression, we can find var(M_int):
+     *      var(C_int)    = sum_k( var(A_k_int * B_k_int) ) + var(D_int)
+     *                    = sum_k ( E[A_k_int^2 * B_k_int^2] - E[A_k_int]^2 * E[B_k_int]^2 ) + var(D_int)
+     *                    = ...
+     *                    = K * (var_a_int * var_b_int + var_a_int * mean^2_b_int + var_b_int * mean^2_a_int) + var_d_int
+     *                      \_______________________________________________________________________________/
+     *                                                          var(M_int)
+     *
+     *      Now, we know mean and variance of D_int, we can return a suitable bias range with
+     *          [mean_d_int +/- num_sd * std_d_int]
+     *
+     *      This also means, for floating point value of D, the following:
+     *          var(D) = var_d = var(D_int) * a_scale^2 * b_scale^2
+     *
+     *      E[D] and var(D) calculated in steps (1) and (2) can be substituted into E[C] and var(C) calculations.
+     *
+     * 3) Now, we have an idea of what an average C will look like and how much deviation
+     *    is present around it. The exact distribution of C is difficult to come up with dependent on K.
+     *    But, as K increases, due to Central Limit Theorem, it'll look more like a bell shaped figure,
+     *    approaching normal distribution.
+     *
+     *    This is useful because, in normal distribution, we know that values +- 2 std_deviation around
+     *    the mean constitute 95% of the values. Therefore, setting a plausible range for us:
+     *      C_range = [C_min, C_max] = [mean_c - 2 * std_c, mean_c + 2 * std_c]
+     *
+     * 4)
+     *    If we map this [C_min, C_max] to [0, 255] or [-128, 127] depending on the signedness of the
+     *    data type, we can find a suitable scale and offset for the output. On average, it's expected
+     *    that 5% of the output values will saturate and 95% will remain in the range.
+     *
+     *    The equations to be solved for offset_c and scale_c are:
+     *          C_min = scale_c * (type_min - offset_c)
+     *          C_max = scale_c * (type_max - offset_c)
+     */
+
+    const int32_t a_offset = a_q_info.uniform().offset;
+    const float   a_scale  = a_q_info.uniform().scale;
+    const int32_t b_offset = b_q_info.uniform().offset;
+    const float   b_scale  = b_q_info.uniform().scale;
+
+    // Integer value statistics. Valid for both Lhs/A and Rhs/B
+    const float     mean_a_int = (t_max + t_min) / 2.f;
+    constexpr float var_a_int  = (256 * 256 - 1) / 12.f; // Discrete uniform RV variance
+    const float     mean_b_int = mean_a_int;             // A_int and B_int has the same stats
+    constexpr float var_b_int  = var_a_int;
+
+    // Lhs/A stats
+    const float max_a  = (t_max - a_offset) * a_scale;
+    const float min_a  = (t_min - a_offset) * a_scale;
+    const float mean_a = (max_a + min_a) / 2;
+    const float var_a  = (max_a - min_a) * (max_a - min_a) / 12;
+
+    // Rhs/B stats
+    const float max_b  = (t_max - b_offset) * b_scale;
+    const float min_b  = (t_min - b_offset) * b_scale;
+    const float mean_b = (max_b + min_b) / 2;
+    const float var_b  = (max_b - min_b) * (max_b - min_b) / 12;
+
+    // Integer multiplication output/M stats; variance of a product of independent RVs, same form as var_out below (step 2)
+    const float mean_m_int = K * mean_a_int * mean_b_int;
+    const float var_m_int  = K * (var_a_int * var_b_int + var_a_int * mean_b_int * mean_b_int + var_b_int * mean_a_int * mean_a_int);
+    const float std_m_int  = std::sqrt(var_m_int);
+
+    // Bias/D both Int and Float statistics
+    const float mean_d_int = bias_fraction * mean_m_int;
+    const float std_d_int  = bias_fraction * std_m_int;
+    const float mean_d     = a_scale * b_scale * mean_d_int;
+    const float std_d      = a_scale * b_scale * std_d_int;
+    const float var_d      = std_d * std_d;
+
+    // Also calculate the suggested bias range (float bounds are truncated towards zero when stored as integers)
+    const int32_t min_bias = static_cast<int32_t>(mean_d_int - (num_sd * std_d_int));
+    const int32_t max_bias = static_cast<int32_t>(mean_d_int + (num_sd * std_d_int));
+
+    // Output/C stats
+    const float mean_out = K * mean_a * mean_b + mean_d;
+    const float var_out  = K * (var_a * var_b + var_a * mean_b * mean_b + var_b * mean_a * mean_a) + var_d;
+    const float std_out  = std::sqrt(var_out);
+
+    // Output quantization setup: spread 2 * num_sd standard deviations over the 255 steps of the 8-bit type (step 4)
+    const float   scale_out  = (2 * num_sd) * std_out / 255;
+    const int32_t offset_out = static_cast<int32_t>(t_min - (mean_out - (num_sd * std_out)) / scale_out);
+
+    const QuantizationInfo c_q_info(scale_out, offset_out);
+
+    return { c_q_info, min_bias, max_bias };
+}
+
 template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
 template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord);
 template void get_tile(const SimpleTensor<int> &in, SimpleTensor<int> &roi, const Coordinates &coord);
@@ -413,6 +583,8 @@ template void transpose_matrix(const SimpleTensor<half> &in, SimpleTensor<half>
 template void transpose_matrix(const SimpleTensor<int> &in, SimpleTensor<int> &out);
 template void transpose_matrix(const SimpleTensor<short> &in, SimpleTensor<short> &out);
 template void transpose_matrix(const SimpleTensor<char> &in, SimpleTensor<char> &out);
+template void transpose_matrix(const SimpleTensor<int8_t> &in, SimpleTensor<int8_t> &out);
+template void transpose_matrix(const SimpleTensor<uint8_t> &in, SimpleTensor<uint8_t> &out);
 template void matrix_multiply(const SimpleTensor<float> &a, const SimpleTensor<float> &b, SimpleTensor<float> &out);
 template void matrix_multiply(const SimpleTensor<half> &a, const SimpleTensor<half> &b, SimpleTensor<half> &out);
 
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 604840b33e..e044620556 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,16 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_VALIDATION_HELPERS_H
-#define ARM_COMPUTE_TEST_VALIDATION_HELPERS_H
+#ifndef ACL_TESTS_VALIDATION_HELPERS_H
+#define ACL_TESTS_VALIDATION_HELPERS_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "support/Half.h"
 #include "tests/Globals.h"
 #include "tests/SimpleTensor.h"
 
-#include <math.h>
+#include <cmath>
+#include <cstdint>
 #include <random>
 #include <type_traits>
 #include <utility>
@@ -50,6 +53,23 @@ template <>
 struct is_floating_point<half> : public std::true_type
 {
 };
+template <> // Report bfloat16 as a floating-point type, mirroring the half specialization above
+struct is_floating_point<bfloat16> : public std::true_type
+{
+};
+
+/** Helper struct to store the hints for
+ *  - destination quantization info
+ *  - minimum bias value
+ *  - maximum bias value
+ * in quantized test construction, as returned by the suggest_*_dst_q_info_and_bias() helpers.
+ */
+struct QuantizationHint
+{
+    QuantizationInfo q_info;   /**< Suggested quantization info (scale and offset) for the destination tensor */
+    int32_t          bias_min; /**< Lower end of the suggested integer bias range */
+    int32_t          bias_max; /**< Upper end of the suggested integer bias range */
+};
 
 /** Helper function to get the testing range for each activation layer.
  *
@@ -63,13 +83,13 @@ std::pair<T, T> get_activation_layer_test_bounds(ActivationLayerInfo::Activation
 {
     std::pair<T, T> bounds;
 
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::F16:
         {
             using namespace half_float::literal;
 
-            switch(activation)
+            switch (activation)
             {
                 case ActivationLayerInfo::ActivationFunction::TANH:
                 case ActivationLayerInfo::ActivationFunction::SQUARE:
@@ -89,7 +109,7 @@ std::pair<T, T> get_activation_layer_test_bounds(ActivationLayerInfo::Activation
             break;
         }
         case DataType::F32:
-            switch(activation)
+            switch (activation)
             {
                 case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
                     // Reduce range as exponent overflows
@@ -111,72 +131,6 @@ std::pair<T, T> get_activation_layer_test_bounds(ActivationLayerInfo::Activation
     return bounds;
 }
 
-/** Fill mask with the corresponding given pattern.
- *
- * @param[in,out] mask    Mask to be filled according to pattern
- * @param[in]     cols    Columns (width) of mask
- * @param[in]     rows    Rows (height) of mask
- * @param[in]     pattern Pattern to fill the mask according to
- */
-void fill_mask_from_pattern(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
-
-/** Calculate output tensor shape give a vector of input tensor to concatenate
- *
- * @param[in] input_shapes Shapes of the tensors to concatenate across depth.
- *
- * @return The shape of output concatenated tensor.
- */
-TensorShape calculate_depth_concatenate_shape(const std::vector<TensorShape> &input_shapes);
-
-/** Calculate output tensor shape for the concatenate operation along a given axis
- *
- * @param[in] input_shapes Shapes of the tensors to concatenate across width.
- * @param[in] axis         Axis to use for the concatenate operation
- *
- * @return The shape of output concatenated tensor.
- */
-TensorShape calculate_concatenate_shape(const std::vector<TensorShape> &input_shapes, size_t axis);
-
-/** Parameters of Harris Corners algorithm. */
-struct HarrisCornersParameters
-{
-    float   threshold{ 0.f };           /**< Threshold */
-    float   sensitivity{ 0.f };         /**< Sensitivity */
-    float   min_dist{ 0.f };            /**< Minimum distance */
-    uint8_t constant_border_value{ 0 }; /**< Border value */
-};
-
-/** Generate parameters for Harris Corners algorithm. */
-HarrisCornersParameters harris_corners_parameters();
-
-/** Parameters of Canny edge algorithm. */
-struct CannyEdgeParameters
-{
-    int32_t upper_thresh{ 255 };
-    int32_t lower_thresh{ 0 };
-    uint8_t constant_border_value{ 0 };
-};
-
-/** Generate parameters for Canny edge algorithm. */
-CannyEdgeParameters canny_edge_parameters();
-
-/** Helper function to fill the Lut random by a ILutAccessor.
- *
- * @param[in,out] table Accessor at the Lut.
- *
- */
-template <typename T>
-void fill_lookuptable(T &&table)
-{
-    std::mt19937                                          generator(library->seed());
-    std::uniform_int_distribution<typename T::value_type> distribution(std::numeric_limits<typename T::value_type>::min(), std::numeric_limits<typename T::value_type>::max());
-
-    for(int i = std::numeric_limits<typename T::value_type>::min(); i <= std::numeric_limits<typename T::value_type>::max(); i++)
-    {
-        table[i] = distribution(generator);
-    }
-}
-
 /** Convert an asymmetric quantized simple tensor into float using tensor quantization information.
  *
  * @param[in] src Quantized tensor.
@@ -191,6 +145,7 @@ SimpleTensor<float> convert_from_asymmetric(const SimpleTensor<T> &src);
  * @param[in] src               Float tensor.
  * @param[in] quantization_info Quantification information.
  *
+ * \relates  arm_compute::test::SimpleTensor
  * @return Quantized tensor.
  */
 template <typename T>
@@ -209,7 +164,7 @@ SimpleTensor<float> convert_from_symmetric(const SimpleTensor<T> &src);
  *
  * @param[in] src               Float tensor.
  * @param[in] quantization_info Quantification information.
- *
+ * \relates  arm_compute::test::SimpleTensor
  * @return Quantized tensor.
  */
 template <typename T>
@@ -277,8 +232,83 @@ std::pair<int, int> get_quantized_qasymm8_signed_bounds(const QuantizationInfo &
  * @param[in] max        Floating point maximum value to be quantized
  * @param[in] channel_id Channel id for per channel quantization info.
  */
-std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id = 0);
+std::pair<int, int>
+get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id = 0);
+
+/** Add random padding along the X axis (between 1 and 16 columns per side) to all the input tensors.
+ *  This is used in our validation suite in order to simulate implicit padding addition after configuring, but before allocating.
+ *
+ * @param[in] tensors        List of tensors to add padding to
+ * @param[in] data_layout    (Optional) Data layout of the operator
+ * @param[in] only_right_pad (Optional) Only right padding testing, in case of cl image padding
+ *
+ * @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC
+ */
+void add_padding_x(std::initializer_list<ITensor *> tensors,
+                   const DataLayout                &data_layout    = DataLayout::NHWC,
+                   bool                             only_right_pad = false);
+
+/** For 2d convolution, given the Lhs/Rhs matrix quantization information and the convolution dimension,
+ *  calculate a suitable output quantization and suggested bias range for obtaining non-saturated outputs with high probability.
+ *
+ * @param[in] in_q_info     Input matrix quantization info
+ * @param[in] weight_q_info Weights matrix quantization info
+ * @param[in] height        Height of the weights tensor
+ * @param[in] width         Width of the weights tensor
+ * @param[in] channels      Number of input channels
+ * @param[in] data_type     data type, only QASYMM8, QASYMM8_SIGNED are supported
+ * @param[in] bias_fraction see @ref suggest_mac_dst_q_info_and_bias() for explanation
+ *
+ * @return QuantizationHint object containing the suggested output quantization info and min/max bias range
+ */
+QuantizationHint suggest_conv_dst_q_info_and_bias(const QuantizationInfo &in_q_info,
+                                                  const QuantizationInfo &weight_q_info,
+                                                  int32_t                 height,
+                                                  int32_t                 width,
+                                                  int32_t                 channels,
+                                                  DataType                data_type,
+                                                  float                   bias_fraction);
+
+/** For a matrix multiplication, given the Lhs/Rhs matrix quantization information and the matrix multiplication dimensions,
+ *  calculate a suitable output quantization and suggested bias range for obtaining non-saturated outputs with high probability.
+ *
+ * @param[in] lhs_q_info    Lhs matrix quantization info
+ * @param[in] rhs_q_info    Rhs matrix quantization info
+ * @param[in] m             Number of rows of Lhs matrix
+ * @param[in] n             Number of columns of Rhs Matrix
+ * @param[in] k             Number of rows/columns of Rhs/Lhs Matrix
+ * @param[in] data_type     data type, only QASYMM8, QASYMM8_SIGNED are supported
+ * @param[in] bias_fraction see @ref suggest_mac_dst_q_info_and_bias() for explanation
+ *
+ * @return QuantizationHint object containing the suggested output quantization info and min/max bias range
+ */
+QuantizationHint suggest_matmul_dst_q_info_and_bias(const QuantizationInfo &lhs_q_info,
+                                                    const QuantizationInfo &rhs_q_info,
+                                                    int32_t                 m,
+                                                    int32_t                 n,
+                                                    int32_t                 k,
+                                                    DataType                data_type,
+                                                    float                   bias_fraction);
+
+/** For a multiply-accumulate (mac), given the Lhs/Rhs vector quantization information and the dot product dimensions,
+ *  calculate a suitable output quantization and suggested bias range for obtaining non-saturated outputs with high probability.
+ *
+ * @param[in] lhs_q_info    Lhs matrix quantization info
+ * @param[in] rhs_q_info    Rhs matrix quantization info
+ * @param[in] k             number of accumulations taking place in the sum, i.e. c_k = sum_k(a_k * b_k)
+ * @param[in] data_type     data type, only QASYMM8, QASYMM8_SIGNED are supported
+ * @param[in] bias_fraction the fraction of bias amplitude compared to integer accumulation.
+ * @param[in] num_sd        (Optional) number of standard deviations we allow from the mean. Default value is 2.
+ *
+ * @return QuantizationHint object containing the suggested output quantization info and min/max bias range
+ */
+QuantizationHint suggest_mac_dst_q_info_and_bias(const QuantizationInfo &lhs_q_info,
+                                                 const QuantizationInfo &rhs_q_info,
+                                                 int32_t                 k,
+                                                 DataType                data_type,
+                                                 float                   bias_fraction,
+                                                 int                     num_sd = 2);
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_VALIDATION_HELPERS_H */
+#endif // ACL_TESTS_VALIDATION_HELPERS_H
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 577603d07d..73f5de68ac 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,13 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/RuntimeContext.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuActivationKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ActivationFunctionsDataset.h"
@@ -37,7 +40,8 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ActivationLayerFixture.h"
 
-#include "support/Requires.h"
+#include "arm_compute/Acl.hpp"
+#include "support/AclRequires.h"
 
 namespace arm_compute
 {
@@ -65,14 +69,16 @@ RelativeTolerance<float> relative_tolerance(DataType data_type, ActivationLayerI
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
         case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+        case ActivationLayerInfo::ActivationFunction::SWISH:
+        case ActivationLayerInfo::ActivationFunction::GELU:
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.25f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.1f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return RelativeTolerance<float>(0.05f);
             }
@@ -80,11 +86,11 @@ RelativeTolerance<float> relative_tolerance(DataType data_type, ActivationLayerI
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.9f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.01f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return RelativeTolerance<float>(0.00001f);
             }
@@ -107,15 +113,16 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::SWISH:
         case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.25f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.01f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return AbsoluteTolerance<float>(0.00001f);
             }
@@ -123,11 +130,11 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.9f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.01f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return AbsoluteTolerance<float>(0.00001f);
             }
@@ -169,7 +176,8 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions(), framework::dataset::make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH));
+const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions(),
+                                                   framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::SWISH }));
 
 /** Input data sets. */
 const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), NeonActivationFunctionsDataset), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
@@ -215,6 +223,48 @@ void test_float_sqrt_boundary_value()
 TEST_SUITE(NEON)
 TEST_SUITE(ActivationLayer)
 
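+/** Test case for creating and running an activation operator through the acl:: API (acl::Activation).
+ *
+ * Create a CPU context, a queue, a ReLU operator and its Float32 tensors, pack the tensors and execute the operator once.
+ *
+ * Checks performed in order:
+ * - The status reported by each creation step, by the last pack.add() and by run() is acl::StatusCode::Success
+ */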
+TEST_CASE(ActivationAPI, framework::DatasetMode::ALL)
+{
+    acl::StatusCode err = acl::StatusCode::Success;
+
+    // Create context & Queue
+    acl::Context ctx(acl::Target::Cpu, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    acl::Queue queue(ctx, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    // Create activation operator
+    acl::TensorDescriptor src_info({ 2, 3 }, acl::DataType::Float32);
+    acl::TensorDescriptor dst_info({ 2, 3 }, acl::DataType::Float32);
+    acl::ActivationDesc   desc{ AclRelu, 6.f, 0.f, false };
+
+    acl::Activation act(ctx, src_info, dst_info, desc, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    // Create tensors and feed
+    acl::Tensor src(ctx, src_info, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+    acl::Tensor dst(ctx, dst_info, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    acl::TensorPack pack(ctx);
+    err = pack.add(src, ACL_SRC);
+    err = pack.add(dst, ACL_DST);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    // Execute operator
+    err = act.run(queue, pack);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -236,6 +286,49 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     bool is_valid = bool(NEActivationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), act_info));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QSYMM16
+                                                            })),
+                combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                        framework::dataset::make("DataType", { DataType::F32,
+                                                               DataType::F16,
+                                                             }))),
+                combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                        framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                               DataType::QASYMM8_SIGNED,
+                                                               DataType::QSYMM16
+                                                             }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{data_type, CPUModel::GENERIC, cpu_isa,ActivationLayerInfo::ActivationFunction::BOUNDED_RELU}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+    if( data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)
+    {
+#ifdef __aarch64__
+        expected = "neon_q8_activation_lut";
+#else  // __aarch64__
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+#endif // __aarch64__
+    }
+    std::string actual   = selected_impl->name;
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
@@ -316,9 +409,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<int8_t>, fram
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 /** Input data sets. */
-const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LOGISTIC,
-                                                                                                       ActivationLayerInfo::ActivationFunction::TANH
-                                                                                                     });
+const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction",
+{
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+});
 const auto Int16QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }), Int16QuantizedActivationFunctionsDataset),
                                                      framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
 
diff --git a/tests/validation/NEON/AddMulAdd.cpp b/tests/validation/NEON/AddMulAdd.cpp
new file mode 100644
index 0000000000..77e3d80fe6
--- /dev/null
+++ b/tests/validation/NEON/AddMulAdd.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/AddMulAddFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f);     /**< Tolerance for floating point tests */
+const AbsoluteTolerance<half>      tolerance_fp16(half(0.1f)); /**< Tolerance for 16-bit floating point tests */
+constexpr AbsoluteTolerance<float> tolerance_quant(1);         /**< Tolerance for quantized tests */
+
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+
+    // Boundaries are aligned with Quantized Data ranges -- DOUBLE check before changing
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, -2.f)
+});
+
+// QASYMM8 test quantizations
+const auto qasymm8_input1_qinfo_set = framework::dataset::make("Input1QInfo", { QuantizationInfo(0.1, 10) }); // Representable Range: [-1, 24.5]
+const auto qasymm8_input2_qinfo_set = framework::dataset::make("Input2QInfo", { QuantizationInfo(0.2, 60) }); // Representable Range: [-12, 39]
+const auto qasymm8_bn_mul_qinfo_set = framework::dataset::make("BnMulInfo", { QuantizationInfo(0.001, 55) }); // Representable Range: [-0.11, 0.2]
+const auto qasymm8_bn_add_qinfo_set = framework::dataset::make("BnAddInfo", { QuantizationInfo(0.02, 20) });  // Representable Range: [-0.4, 4.7]
+
+// Representable Range: [-9.36, 51.84], Expected F32 range: [-13, 63.5], leaving some space for saturation
+const auto qasymm8_add_output_qinfo_set = framework::dataset::make("AddOutputInfo", { QuantizationInfo(0.24, 39) });
+
+// Representable Range: [-4.8, 10.5], Expected FP32 range: [-6.985, 12.7], leaving some space for saturation
+// This range also makes sense with the activation boundaries above, i.e. [-2, 8] for LU_BOUNDED_RELU and [0, 6] for BOUNDED_RELU
+const auto qasymm8_final_output_qinfo_set = framework::dataset::make("FinalOutputInfo", { QuantizationInfo(0.06, 80) });
+
+// QASYMM8_SIGNED test quantizations
+const auto qasymm8_signed_input1_qinfo_set = framework::dataset::make("Input1QInfo", { QuantizationInfo(0.1, 10) });  // Representable Range: [-13.8, 11.7]
+const auto qasymm8_signed_input2_qinfo_set = framework::dataset::make("Input2QInfo", { QuantizationInfo(0.2, -60) }); // Representable Range: [-13.6, 39.4]
+const auto qasymm8_signed_bn_mul_qinfo_set = framework::dataset::make("BnMulInfo", { QuantizationInfo(0.001, 55) });  // Representable Range: [-0.183, 0.072]
+const auto qasymm8_signed_bn_add_qinfo_set = framework::dataset::make("BnAddInfo", { QuantizationInfo(0.4, -120) });  // Representable Range: [-0.32, 9.08]
+
+// Representable Range: [-21.36, 39.84], Expected F32 range: [-27.4, 51.1], leaving some space for saturation
+const auto qasymm8_signed_add_output_qinfo_set = framework::dataset::make("AddOutputInfo", { QuantizationInfo(0.24, -39) });
+
+// Representable Range: [-4.8, 10.5], Expected FP32 range: [-9.6713, 14.0942], leaving some space for saturation
+// This range also makes sense with the activation boundaries above, i.e. [-2, 8] for LU_BOUNDED_RELU and [0, 6] for BOUNDED_RELU
+const auto qasymm8_signed_final_output_qinfo_set = framework::dataset::make("FinalOutputInfo", { QuantizationInfo(0.06, -48) });
+
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(AddMulAdd)
+
+template <typename T>
+using NEAddMulAddFloatFixture = AddMulAddFloatValidationFixture<Tensor, Accessor, NEAddMulAdd, T, true>;
+
+template <typename T>
+using NEAddMulAddFloatFixtureWoIntermOut = AddMulAddFloatValidationFixture<Tensor, Accessor, NEAddMulAdd, T, false>;
+
+TEST_SUITE(Float)
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulAddFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
+                                                                                                            ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+// This test is to stress the case when there is no intermediate output required (i.e. nullptr)
+FIXTURE_DATA_TEST_CASE(RunSmallWithoutIntermOutput, NEAddMulAddFloatFixtureWoIntermOut<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })))
+{
+    // Validate outputs
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulAddFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                                  framework::dataset::make("DataType", DataType::F32)),
+                                                                                                          ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // F32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulAddFloatFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                                                           ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulAddFloatFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
+                                                                                                         ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+TEST_SUITE_END() // F16
+#endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEAddMulQuantizedFixture = AddMulAddQuantizedValidationFixture<Tensor, Accessor, NEAddMulAdd, T, true>;
+
+template <typename T>
+using NEAddMulAddQuantizedFixtureWoIntermOut = AddMulAddQuantizedValidationFixture<Tensor, Accessor, NEAddMulAdd, T, false>;
+
+TEST_SUITE(Quantized)
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       qasymm8_input1_qinfo_set),
+                                                                                                                       qasymm8_input2_qinfo_set),
+                                                                                                                       qasymm8_bn_mul_qinfo_set),
+                                                                                                                       qasymm8_bn_add_qinfo_set),
+                                                                                                                       qasymm8_add_output_qinfo_set),
+                                                                                                               qasymm8_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                     ActivationFunctionsDataset),
+                                                                                                                     qasymm8_input1_qinfo_set),
+                                                                                                                     qasymm8_input2_qinfo_set),
+                                                                                                                     qasymm8_bn_mul_qinfo_set),
+                                                                                                                     qasymm8_bn_add_qinfo_set),
+                                                                                                                     qasymm8_add_output_qinfo_set),
+                                                                                                             qasymm8_final_output_qinfo_set))
+{
+    // Validate the intermediate and the final output against their references, both within tolerance_quant
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                      ActivationFunctionsDataset),
+                                                                                                                      qasymm8_signed_input1_qinfo_set),
+                                                                                                                      qasymm8_signed_input2_qinfo_set),
+                                                                                                                      qasymm8_signed_bn_mul_qinfo_set),
+                                                                                                                      qasymm8_signed_bn_add_qinfo_set),
+                                                                                                                      qasymm8_signed_add_output_qinfo_set),
+                                                                                                              qasymm8_signed_final_output_qinfo_set))
+{
+    // Validate the intermediate and the final output against their references, both within tolerance_quant
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                    ActivationFunctionsDataset),
+                                                                                                                    qasymm8_signed_input1_qinfo_set),
+                                                                                                                    qasymm8_signed_input2_qinfo_set),
+                                                                                                                    qasymm8_signed_bn_mul_qinfo_set),
+                                                                                                                    qasymm8_signed_bn_add_qinfo_set),
+                                                                                                                    qasymm8_signed_add_output_qinfo_set),
+                                                                                                            qasymm8_signed_final_output_qinfo_set))
+{
+    // Validate the intermediate and the final output against their references, both within tolerance_quant
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // AddMulAdd
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // __aarch64__
diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp
index 0a4071076a..91b8128dea 100644
--- a/tests/validation/NEON/ArgMinMax.cpp
+++ b/tests/validation/NEON/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,27 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+const auto OpsDataset   = framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }); // Both arg-index reductions (min and max)
+const auto AxisDataset  = framework::dataset::make("Axis", { 0, 1, 2, 3 }); // Axis values 0 to 3
+const auto QInfoDataset = framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) }); // Shared by the QASYMM8 and QASYMM8_SIGNED suites. NOTE(review): the signed suite previously used 5.f / 127.f - confirm the change is intended
+// Small 1D/2D shapes used only by RunSmallAxis0, which pairs them with axis 0
+const auto ArgMinMaxSmallDatasetAxis0 = framework::dataset::make("Shape",
+{
+    TensorShape{ 1U, 5U },
+    TensorShape{ 2U, 3U },
+    TensorShape{ 1U },
+    TensorShape{ 3U },
+    TensorShape{ 2U },
+    TensorShape{ 5U },
+    TensorShape{ 17U },
+    TensorShape{ 15U, 2U },
+});
+using ArgMinMaxSmallDataset = datasets::Small4DShapes;
+using ArgMinMaxLargeDataset = datasets::Large4DShapes;
+} // namespace
+
 TEST_SUITE(NEON)
 TEST_SUITE(ArgMinMax)
 
@@ -70,23 +91,50 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-template <typename T>
-using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T1, T2>;
+// Concrete instantiations named <input>_<output>; the test cases pair each one with matching DataTypeIn / DataTypeOut values
+using NEArgMinMaxValidationFixture_S32_S32 = NEArgMinMaxValidationFixture<int32_t, int32_t>;
+using NEArgMinMaxValidationFixture_F16_S32 = NEArgMinMaxValidationFixture<half, int32_t>;
+using NEArgMinMaxValidationFixture_F32_S32 = NEArgMinMaxValidationFixture<float, int32_t>;
+#ifdef __aarch64__
+using NEArgMinMaxValidationFixture_F32_S64 = NEArgMinMaxValidationFixture<float, int64_t>;
+#endif // __aarch64__
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<int32_t>,
+FIXTURE_DATA_TEST_CASE(RunSmallAxis0,
+                       NEArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDatasetAxis0,
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       framework::dataset::make("Axis", { 0 })),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEArgMinMaxValidationFixture_S32_S32,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate the target output against the reference output
+    validate(Accessor(_target), _reference);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<int32_t>,
+                       NEArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -97,18 +145,26 @@ TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<half>,
+                       NEArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<half>,
+                       NEArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -118,18 +174,41 @@ TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<float>,
+                       NEArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
+#ifdef __aarch64__
+FIXTURE_DATA_TEST_CASE(RunSmall_F32_S64,
+                       NEArgMinMaxValidationFixture_F32_S64,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S64)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate the S64 output against the reference (this case is only compiled when __aarch64__ is defined)
+    validate(Accessor(_target), _reference);
+}
+#endif // __aarch64__
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<float>,
+                       NEArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -137,27 +216,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
-template <typename T>
-using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T1, T2>;
+// Quantized instantiations: uint8_t is used by the QASYMM8 suite and int8_t by the QASYMM8_SIGNED suite, both with DataTypeOut S32
+using NEArgMinMaxQuantizedValidationFixture_U8_S32 = NEArgMinMaxQuantizedValidationFixture<uint8_t, int32_t>;
+using NEArgMinMaxQuantizedValidationFixture_S8_S32 = NEArgMinMaxQuantizedValidationFixture<int8_t, int32_t>;
 
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -166,22 +253,27 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxQuantizedValidationFixture<int8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxQuantizedValidationFixture<int8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index 98341805ed..535c3e634e 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
@@ -43,31 +46,13 @@ namespace validation
 {
 namespace
 {
-#if !defined(__aarch64__) || defined(__ARM_FEATURE_SVE)
+#if !defined(__aarch64__) || defined(ENABLE_SVE)
 constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
-#else                                                  // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE)
+#else                                                  // !defined(__aarch64__) || defined(ENABLE_SVE)
 constexpr AbsoluteTolerance<float> tolerance_quant(0);
-#endif                                                 // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE)
-
-/** Input data sets **/
-const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto ArithmeticAdditionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-const auto ArithmeticAdditionS32Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S32 }), framework::dataset::make("DataType", DataType::S32)),
-                                                  framework::dataset::make("DataType", DataType::S32));
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto ArithmeticAdditionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
-                                                   framework::dataset::make("DataType", DataType::F16));
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto ArithmeticAdditionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
-                                                   framework::dataset::make("DataType", DataType::F32));
-const auto ArithmeticAdditionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                      framework::dataset::make("DataType", DataType::QASYMM8));
-const auto ArithmeticAdditionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-const auto ArithmeticAdditionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                      framework::dataset::make("DataType", DataType::QSYMM16));
+#endif                                                 // !defined(__aarch64__) || defined(ENABLE_SVE)
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true }); // Both values of the "InPlace" parameter
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); // "InPlace" parameter fixed to false
 } // namespace
 
 TEST_SUITE(NEON)
@@ -79,25 +64,22 @@ using NEArithmeticAdditionFixture = ArithmeticAdditionValidationFixture<Tensor,
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Unsupported broadcast
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),// Mismatching shapes
                                                       }),
-               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(1U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     Status s = NEArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false),
@@ -106,6 +88,63 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                               ConvertPolicy::WRAP);
     ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+                combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::U8,
+                                                              DataType::S16,
+                                                              DataType::S32,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QSYMM16
+                                                            })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false})),
+                combine(combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                        framework::dataset::make("DataType", { DataType::F32,
+                                                               DataType::F16,
+                                                               DataType::U8,
+                                                               DataType::S16,
+                                                               DataType::S32
+                                                             })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false}))),
+                combine(combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                        framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                               DataType::QASYMM8_SIGNED,
+                                                               DataType::QSYMM16
+                                                             })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false}))),
+               cpu_ext, data_type, can_use_fixedpoint)
+{
+    using namespace cpu::kernels;
+    // Describe a CPU exposing only the extension under test; fp16 is flagged only for the F16 data type
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+    // Query the preferred implementation for this data type / ISA / fixed-point combination
+    const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    bool qasymm8_any = (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED);
+    // Expected name: "neon_<dt>_add_fixedpoint" for QASYMM8 / QASYMM8_SIGNED with fixed-point enabled (whatever cpu_ext is), otherwise "<lower-cased cpu_ext>_<dt>_add", where <dt> is cpu_impl_dt(data_type)
+    std::string expected;
+    if(qasymm8_any && can_use_fixedpoint)
+    {
+        expected = "neon_" + cpu_impl_dt(data_type) + "_add_fixedpoint";
+    }
+    else
+    {
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_add";
+    }
+
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
@@ -127,8 +166,10 @@ TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT)
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::U8)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -136,15 +177,19 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework
 TEST_SUITE_END() // U8
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::S16)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::S16)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -152,8 +197,10 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework
 TEST_SUITE_END() // S16
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS32Dataset),
-                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::S32)),
+                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                            OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -164,8 +211,9 @@ TEST_SUITE_END() // Integer
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
-                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                         OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -174,15 +222,19 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP32Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::F32)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionFP32Dataset),
-                                                                                                              framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                      DataType::F32)),
+                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                              OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -191,17 +243,19 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::
 template <typename T>
 using NEArithmeticAdditionBroadcastFixture = ArithmeticAdditionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticAddition, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -220,11 +274,12 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQASYMM8Dataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_quant);
@@ -235,22 +290,24 @@ TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<int8_t>,
                        framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQASYMM8SIGNEDDataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_quant);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(
-                           datasets::SmallShapesBroadcast(), ArithmeticAdditionQASYMM8SIGNEDDataset),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(
+                           datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_quant);
@@ -261,11 +318,12 @@ TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<int16_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQSYMM16Dataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QSYMM16)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_quant);
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
index 7a36893445..8886ca2db5 100644
--- a/tests/validation/NEON/ArithmeticSubtraction.cpp
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -50,45 +50,16 @@ constexpr AbsoluteTolerance<float> tolerance_qasymm8(1); /**< Tolerance value fo
 #endif                                                     //__aarch64__
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 
-/** Input data sets **/
-const auto ArithmeticSubtractionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8),
-                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                         framework::dataset::make("DataType", DataType::QASYMM8));
-
-const auto ArithmeticSubtractionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
-                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-
-const auto ArithmeticSubtractionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16),
-                                                                 framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                         framework::dataset::make("DataType", DataType::QSYMM16));
-
-const auto ArithmeticSubtractionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8),
-                                                            framework::dataset::make("DataType", DataType::U8)),
-                                                    framework::dataset::make("DataType", DataType::U8));
-
-const auto ArithmeticSubtractionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }),
-                                                             framework::dataset::make("DataType", DataType::S16)),
-                                                     framework::dataset::make("DataType", DataType::S16));
-
-const auto ArithmeticSubtractionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32),
-                                                             framework::dataset::make("DataType", DataType::S32)),
-                                                     framework::dataset::make("DataType", DataType::S32));
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto ArithmeticSubtractionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16),
-                                                              framework::dataset::make("DataType", DataType::F16)),
-                                                      framework::dataset::make("DataType", DataType::F16));
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto ArithmeticSubtractionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32),
-                                                              framework::dataset::make("DataType", DataType::F32)),
-                                                      framework::dataset::make("DataType", DataType::F32));
-
+// Quantization Information DataSet
 const auto ArithmeticSubtractionQuantizationInfoDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(10, 120) }),
                                                                           framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(20, 110) })),
                                                                   framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(15, 125) }));
 const auto ArithmeticSubtractionQuantizationInfoSignedDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.5f, 10) }),
                                                                                 framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.5f, 20) })),
                                                                         framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.5f, 50) }));
+const auto ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.8f, 10) }),
+                                                                                       framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.8f, 10) })),
+                                                                               framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.8f, 10) }));
 const auto ArithmeticSubtractionQuantizationInfoSymmetric = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.3f, 0) }),
                                                                             framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.7f, 0) })),
                                                                     framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0) }));
@@ -105,35 +76,31 @@ using NEArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture<Te
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-        framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                  TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::QASYMM8), // Mismatching types
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Invalid convert policy
         }),
-        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
-        framework::dataset::make("ConvertPolicy",{ ConvertPolicy::WRAP,
-                                                ConvertPolicy::SATURATE,
-                                                ConvertPolicy::SATURATE,
-                                                ConvertPolicy::WRAP,
-                                                ConvertPolicy::WRAP,
-                                                ConvertPolicy::WRAP,
+        framework::dataset::make("ConvertPolicy",{ ConvertPolicy::SATURATE,
+                                                   ConvertPolicy::SATURATE,
+                                                   ConvertPolicy::WRAP,
+                                                   ConvertPolicy::WRAP,
+                                                   ConvertPolicy::WRAP,
         })),
-        framework::dataset::make("Expected", { true, true, false, false, false, false})),
+        framework::dataset::make("Expected", { true, false, false, false, false})),
         input1_info, input2_info, output_info, policy, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NEArithmeticSubtraction::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), policy)) == expected, framework::LogLevel::ERRORS);
@@ -194,7 +161,8 @@ TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL)
 TEST_SUITE_END() // InPlaceValidate
 
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::U8)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -210,10 +178,11 @@ using NEArithmeticSubtractionQSYMM16Fixture                = ArithmeticSubtracti
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionQASYMM8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::QASYMM8)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                                                                                                                      ArithmeticSubtractionQuantizationInfoDataset),
-                                                                                                             InPlaceDataSet))
+                                                                                                             OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -222,19 +191,17 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
-                                                                                                                       datasets::SmallShapes(),
-                                                                                                                       ArithmeticSubtractionQASYMM8SIGNEDDataset),
+                                                                                                                       datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                                                                                                                    ArithmeticSubtractionQuantizationInfoSignedDataset),
-                                                                                                                   InPlaceDataSet))
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
                            datasets::SmallShapesBroadcast(),
-                           ArithmeticSubtractionQASYMM8SIGNEDDataset),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        ArithmeticSubtractionQuantizationInfoSignedDataset),
                        OutOfPlaceDataSet))
@@ -242,12 +209,22 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBr
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
+                           datasets::TinyShapesBroadcastInplace(),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset),
+                       InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
         datasets::SmallShapes(),
-        ArithmeticSubtractionQSYMM16Dataset),
+        framework::dataset::make("DataType", DataType::QSYMM16)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                                                                                                                      ArithmeticSubtractionQuantizationInfoSymmetric),
                                                                                                              OutOfPlaceDataSet))
@@ -259,7 +236,8 @@ TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S16)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -267,7 +245,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framew
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S16)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    OutOfPlaceDataSet))
 {
@@ -277,7 +256,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framew
 TEST_SUITE_END() // S16
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S32)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -285,7 +265,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framew
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S32)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    OutOfPlaceDataSet))
 {
@@ -297,7 +278,8 @@ TEST_SUITE_END() // S32
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                             OutOfPlaceDataSet))
 {
@@ -308,7 +290,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::F32)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    InPlaceDataSet))
 {
@@ -316,7 +299,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framewor
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                  OutOfPlaceDataSet))
 {
@@ -328,7 +312,7 @@ template <typename T>
 using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticSubtraction, T>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        OutOfPlaceDataSet))
 {
@@ -337,7 +321,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixtur
 }
 
 FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        OutOfPlaceDataSet))
 {
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
index a1ae6971f4..50eaf0c667 100644
--- a/tests/validation/NEON/BatchNormalizationLayer.cpp
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -51,7 +51,7 @@ namespace
 RelativeTolerance<float>           rel_tolerance_f32(0.05f);   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<float> abs_tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<float> abs_tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> abs_tolerance_f16(0.015f); /**< Absolute tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 #endif                                                       // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 const auto act_infos = framework::dataset::make("ActivationInfo",
diff --git a/tests/validation/NEON/BatchToSpaceLayer.cpp b/tests/validation/NEON/BatchToSpaceLayer.cpp
index a305dcbcc4..8cf11b7b95 100644
--- a/tests/validation/NEON/BatchToSpaceLayer.cpp
+++ b/tests/validation/NEON/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,55 +49,38 @@ using NEBatchToSpaceLayerFixture = BatchToSpaceLayerValidationFixture<Tensor, Ac
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Mismatching data types
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Wrong data type block shape
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U, 4U), 1, DataType::F32), // Wrong tensor shape
-                                                     }),
-               framework::dataset::make("BlockShapeInfo",{ TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 4U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                     })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                     })),
-               framework::dataset::make("Expected", { true, true, true, false, false, false})),
-               input_info, block_shape_info, output_info, expected)
-{
-    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), &block_shape_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false)));
-    ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
-}
-DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(
+DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Negative block shapes
-                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32), // Wrong tensor shape
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blockx > blocky
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blocky > blockx
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Mismatching data types
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Negative block shapes
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32),// Invalid: Unsupported tensor rank
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid: Output tensor shape has a wrong batch dimension
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid: Output tensor shape has a wrong spatial dimension
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: Correct output tensor shape with cropping
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid: Output tensor shape does not account for cropping
                                                      }),
-               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2 })),
-               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2 })),
+               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("CropInfo", {
+                CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{3, 2, 1, 3}, CropInfo{3, 2, 1, 3}
+               })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 16U, 2U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 32U, 2U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 8U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(33U, 32U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(27U, 12U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, true, false, false, false})),
-               input_info, block_shape_x, block_shape_y, output_info, expected)
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true, false})),
+               input_info, block_shape_x, block_shape_y, crop_info, output_info, expected)
 {
-    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false)));
+    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false), crop_info)); // NOTE(review): despite its name, has_error is expected to be true for the "Supported" cases, i.e. it appears to mean "validation passed" - confirm
     ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -112,6 +95,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchToSpaceLayerFixture<float>, framework::D
     // Validate output
     validate(Accessor(_target), _reference);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, NEBatchToSpaceLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Check the F32 target output against the reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchToSpaceLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                      DataType::F32)),
                                                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -129,6 +122,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchToSpaceLayerFixture<half>, framework::Da
     // Validate output
     validate(Accessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, NEBatchToSpaceLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Check the F16 target output against the reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchToSpaceLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                     DataType::F16)),
                                                                                                             framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp
index db73bea9cb..b56594546b 100644
--- a/tests/validation/NEON/Cast.cpp
+++ b/tests/validation/NEON/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
@@ -34,7 +37,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/CastFixture.h"
-
 namespace arm_compute
 {
 namespace test
@@ -99,6 +101,11 @@ const auto CastF32toS32Dataset            = combine(framework::dataset::make("Da
 const auto CastF32toQASYMM8Dataset        = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
 const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
 
+// U64 (the first "DataType" entry is the source type, the second is the destination type)
+const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+
+// S64 (the first "DataType" entry is the source type, the second is the destination type)
+const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
 } // namespace
 
 TEST_SUITE(NEON)
@@ -106,6 +113,8 @@ TEST_SUITE(Cast)
 template <typename T>
 using NECastToU8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint8_t>;
 template <typename T>
+using NECastToS8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int8_t>;
+template <typename T>
 using NECastToU16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint16_t>;
 template <typename T>
 using NECastToS16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int16_t>;
@@ -114,6 +123,10 @@ using NECastToU32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, ui
 template <typename T>
 using NECastToS32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int32_t>;
 template <typename T>
+using NECastToU64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint64_t>;
+template <typename T>
+using NECastToS64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int64_t>;
+template <typename T>
 using NECastToF16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, half>;
 template <typename T>
 using NECastToF32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, float>;
@@ -187,6 +200,66 @@ CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture<float>,
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 
+#ifdef __aarch64__
+// S64 (NOTE(review): the 64-bit source suites are compiled only for AArch64 - presumably the kernels are unavailable elsewhere; confirm)
+CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, NECastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
+
+// U64 (same AArch64-only guard as the S64 suite)
+CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
+#endif // __aarch64__
+
+// For each listed data type paired with DataType::F16, check the name of the preferred cast kernel ("<ext>_<type>_cast").
+DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType",
+{
+    DataType::F16,
+    DataType::U8,
+    DataType::S32,
+    DataType::QASYMM8,
+    DataType::QASYMM8_SIGNED,
+})),
+cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = true; // the selector below is always given DataType::F16, so FP16 is enabled unconditionally
+
+    const auto *selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ data_type, DataType::F16, cpu_isa }, cpu::KernelSelectionType::Preferred);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    const std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_cast";
+    const std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelectionSrcFP32, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType",
+{
+    DataType::F16,
+})),
+cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    // ISA description handed to the selector: fp16 is set only when the tested data type is F16.
+    cpuinfo::CpuIsaInfo isa{};
+    isa.neon = (cpu_ext == "NEON");
+    isa.fp16 = (data_type == DataType::F16);
+
+    const CastDataTypeISASelectorData selector{ DataType::F32, data_type, isa };
+    const auto                       *selected_impl = CpuCastKernel::get_implementation(selector, cpu::KernelSelectionType::Preferred);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    const std::string expected = lower_string(cpu_ext) + "_fp32_to_" + cpu_impl_dt(data_type) + "_cast";
+    const std::string actual   = selected_impl->name;
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE_END() // Cast
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/Col2Im.cpp b/tests/validation/NEON/Col2Im.cpp
index 9139f0cca8..7eb8cbf0f6 100644
--- a/tests/validation/NEON/Col2Im.cpp
+++ b/tests/validation/NEON/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/NECol2ImKernel.h"
+#include "src/cpu/kernels/CpuCol2ImKernel.h"
 #include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
@@ -39,7 +39,7 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(Col2Im)
 
-using NECol2Im = NESynthetizeFunction<NECol2ImKernel>;
+using CpuCol2Im = NESynthetizeFunction<cpu::kernels::CpuCol2ImKernel>; // Alias whose validate() is exercised by the Validate test case below
 
 // *INDENT-OFF*
 // clang-format off
@@ -59,7 +59,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, true })),
                input_info, output_info, convolved_width, convolved_height, expected)
 {
-    bool status = bool(NECol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
+    bool status = bool(CpuCol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
diff --git a/tests/validation/NEON/Convolution3D.cpp b/tests/validation/NEON/Convolution3D.cpp
new file mode 100644
index 0000000000..4185488742
--- /dev/null
+++ b/tests/validation/NEON/Convolution3D.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/DirectConvolution3DFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */
+const AbsoluteTolerance<float>            abs_tolerance_f16(0.2f);                   /**< Absolute tolerance for FP16 types */
+constexpr float                           tolerance_num = 0.07f;                     /**< Tolerance number for the FP16 implementation */
+#endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<float>   tolerance_fp32(0.001f);                         /**< Tolerance for floating point tests */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                           /**< Tolerance for quantized tests */
+
+/** Activation functions dataset */
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
+
+const auto data_precommit = combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                    datasets::SmallDirectConv3DShapes(),
+                                                                                    framework::dataset::make("StrideX", { 1, 5, 8 })),
+                                                                                framework::dataset::make("StrideY", { 1, 2, 3 })),
+                                                                            framework::dataset::make("StrideZ", { 1, 2, 1 })),
+                                                                        framework::dataset::make("PadX", { 0, 1, 2 })),
+                                                                    framework::dataset::make("PadY", { 0, 2, 1 })),
+                                                                framework::dataset::make("PadZ", { 0, 3, 5 })),
+                                                            framework::dataset::make("KernelWidth", { 3, 5, 9 })),
+                                                        framework::dataset::make("KernelHeight", { 2, 1, 3 })),
+                                                    framework::dataset::make("KernelDepth", { 1, 2, 3 })),
+                                                framework::dataset::make("NumKernels", { 2, 3, 8 })),
+                                            framework::dataset::make("HasBias", { true, false })),
+                                    ActivationFunctionsDataset);
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Convolution3D)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Mismatching data type input/weights
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Mismatching input feature maps
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid weights dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NHWC), // Invalid data layout
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid biases size
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid biases dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid output size
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::U32, DataLayout::NDHWC), // Invalid data type
+                                              }),
+        framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F16),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 3U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U, 3U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::U32),
+                                              })),
+        framework::dataset::make("BiasesInfo",{ TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(3U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U, 2U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                              })),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(26U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::U32),
+                                              })),
+        framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})),
+        input_info, weights_info, biases_info, output_info, expected)
+{
+        const Conv3dInfo  conv3d_info(Size3D(1, 1, 1), Padding3D(0, 0, 0), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false);
+        bool is_valid = bool(NEConv3D::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv3d_info));
+        ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEDirectConvolution3DFixture = DirectConvolution3DValidationFixture<Tensor, Accessor, NEConv3D, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(data_precommit,
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NDHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(data_precommit,
+                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                framework::dataset::make("DataLayout", { DataLayout::NDHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEDirectConvolution3DQuantizedFixture = DirectConvolution3DValidationQuantizedFixture<Tensor, Accessor, NEConv3D, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // Convolution3D
+TEST_SUITE_END() // Neon
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 6b152c9b68..d739d4e1a4 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,11 +28,16 @@
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/LargeConvolutionLayerDataset.h"
 #include "tests/datasets/SmallConvolutionLayerDataset.h"
-#include "tests/datasets/TinyConvolutionLayerDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -46,6 +51,8 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace detail
 {
 template <>
@@ -77,10 +84,17 @@ const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2
 const AbsoluteTolerance<float>            abs_tolerance_f16(0.2f);                   /**< Absolute tolerance for FP16 types */
 constexpr float                           tolerance_num = 0.07f;                     /**< Tolerance number for the FP16 implementation */
 #endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+
+#ifdef ARM_COMPUTE_ENABLE_SME
+// TODO(COMPMID-6011): SME kernels and the reference model use different rounding modes.
+// Temporarily increase the tolerance for quantized data.
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+#else                                                      // ARM_COMPUTE_ENABLE_SME
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+#endif                                                     // ARM_COMPUTE_ENABLE_SME
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -88,14 +102,41 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
     DataType::QASYMM8,
 });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
 });
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto NoActivation = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+});
+
+const auto ActivationFunctionsDatasetNightly = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+#ifdef __aarch64__
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU),
+#endif // __aarch64__
+});
+
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(0.5f, 10),
     QuantizationInfo(0.3f, 3),
@@ -110,32 +151,32 @@ TEST_SUITE(ConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-                                          framework::dataset::make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32),
+                                          make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(23U, 27U, 32U, 4U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)
                                           }),
-                                          framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32),
+                                          make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 32U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
                                           })),
-                                          framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
+                                          make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(19U, 23U, 21U, 4U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
                                           })),
-                                          framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                          make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(2, 1, 0, 0),
                                                                                  PadStrideInfo(3, 2, 1, 0)
                                           })),
-                                          framework::dataset::make("FastMath", { true,
+                                          make("FastMath", { true,
                                                                                  true,
                                                                                  false,
                                                                                  false
                                           })),
-                                                                           framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+                                                                           make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
                input_info, weights_info, output_info, conv_info, fast_math, expected)
 {
     ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
@@ -147,30 +188,267 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
 // *INDENT-ON*
 TEST_SUITE_END() // ConvolutionLayer
 
+/*
+    Testing Strategy of Neon Winograd:
+        - There is no need to thoroughly test NCHW cases because the Winograd kernels accept
+          NHWC, and NCHW tensors are permuted before and after the kernels run.
+        - Except for relu and bounded relu, testing activations for a single input
+          combination is enough because the activation is not fused into Winograd but is
+          called separately.
+*/
 TEST_SUITE(WinogradLayer)
 template <typename T>
 using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T>;
+template <typename T>
+using NEWinogradConvolutionLayerMixedDataLayoutFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, T, true, true>;
 
 template <typename T>
 using NEWinogradConvolutionLayerNoBiasFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, T, false>;
 
+/** Test case for memory injection in @ref cpu::CpuWinogradConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (compared element-wise with exact float equality)
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto                winograd = std::make_unique<cpu::CpuWinogradConv2d>();
+    const auto          src_info = TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32);
+    const auto          w_info   = TensorInfo(TensorShape(1U), 1, DataType::F32);
+    const auto          b_info   = TensorInfo(TensorShape(1U, 3U, 32U, 1U), 1, DataType::F32);
+    auto                dst_info = TensorInfo(TensorShape(8U, 6U, 1U), 1, DataType::F32);
+    const PadStrideInfo pad_info{};
+
+    winograd->configure(&src_info, &b_info, &w_info, &dst_info, pad_info); // NOTE(review): b_info (4D) appears to be the weights and w_info (1D) the biases - confirm naming
+
+    // The input tensors are created and allocated once here and shared by every call of the lambda below
+    auto a = create_tensor<Tensor>(src_info);
+    auto b = create_tensor<Tensor>(b_info);
+    auto c = create_tensor<Tensor>(w_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    c.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_SRC_2, &c } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_SRC_2, &c } };
+
+    auto mg       = MemoryGroup{};
+    auto ws       = manage_workspace<Tensor>(winograd->workspace(), mg, run_pack, prep_pack);
+    auto run_conv = [&]() -> Tensor
+    {
+        // Only the destination tensor is newly created on every call of this lambda
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+        library->fill_tensor_value(Accessor(a), 1.f);
+        library->fill_tensor_value(Accessor(b), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+
+        // This operator is configured once and captured by this lambda.
+        winograd->prepare(prep_pack);
+        winograd->run(run_pack);
+        return dst;
+    };
+
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const float *>(result_0.buffer())[i] == reinterpret_cast<const float *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEWinogradConvolutionLayer.
+ *
+ * Make sure @ref NEWinogradConvolutionLayer still works when the memory is injected at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (compared element-wise with exact float equality)
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto                conv     = std::make_unique<NEWinogradConvolutionLayer>();
+    const auto          src_info = TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32);
+    const auto          w_info   = TensorInfo(TensorShape(1U), 1, DataType::F32);
+    const auto          b_info   = TensorInfo(TensorShape(1U, 3U, 32U, 1U), 1, DataType::F32);
+    auto                dst_info = TensorInfo(TensorShape(8U, 6U, 1U), 1, DataType::F32);
+    const PadStrideInfo pad_info{};
+
+    auto run_conv = [&]()
+    {
+        auto src = create_tensor<Tensor>(src_info);
+        auto w   = create_tensor<Tensor>(w_info);
+        auto b   = create_tensor<Tensor>(b_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+
+        conv->configure(&src, &b, &w, &dst, pad_info); // the same function object is re-configured on every call; NOTE(review): b (4D) appears to be the weights and w (1D) the biases - confirm naming
+
+        src.allocator()->allocate();
+        b.allocator()->allocate();
+        w.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(b), 2.f);
+        library->fill_tensor_value(Accessor(w), 3.f);
+        conv->run();
+        return dst;
+    };
+
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const float *>(result_0.buffer())[i] == reinterpret_cast<const float *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+DATA_TEST_CASE(SupportedKernels, framework::DatasetMode::ALL, zip(
+                   make("WeightsInfo",
+{
+    // Shapes are always in NCHW format. When layout is NHWC, the shape is permuted
+
+    // Fp32, entries use a mix of NCHW and NHWC layouts
+    // 3x3, 1x3, 3x1 --> all TRUE
+    TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // 5x5, 1x5, 5x1 --> all TRUE
+    TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // 7x7, 1x7, 7x1
+    //  --> 7x7 FALSE, 1x7 and 7x1 TRUE
+    TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+
+    // unsupported kernel sizes --> all FALSE
+    TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // Fp16: 3x3, 1x3, 3x1 --> only 3x3 TRUE
+    TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+    // 5x5, 1x5, 5x1 --> all FALSE
+    TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+    // 7x7, 1x7, 7x1
+    //  --> all FALSE
+    TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+
+    // unsupported kernel sizes --> all FALSE
+    TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+}),
+make("Expected",
+{
+    // fp32
+    true, true, true,    // 3x3, 1x3, 3x1
+    true, true, true,    // 5x5, 1x5, 5x1
+    false, true, true,   // 7x7, 1x7, 7x1
+    false, false, false, // random unsupported kernels
+
+    // fp16
+    true, false, false,  // 3x3, 1x3, 3x1
+    false, false, false, // 5x5, 1x5, 5x1
+    false, false, false, // 7x7, 1x7, 7x1
+    false, false, false, // random unsupported kernels
+})),
+weights_info_const, expected_const)
+{
+    DataType   data_type   = weights_info_const.data_type();
+    DataLayout data_layout = weights_info_const.data_layout();
+
+    TensorInfo input_info   = TensorInfo(TensorShape(17U, 31U, 2U), 1, data_type);
+    TensorInfo bias_info    = TensorInfo(TensorShape(8U), 1, data_type);
+    TensorInfo weights_info = weights_info_const;
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        // Permute the NCHW-ordered input and weights shapes and tag all tensor infos as NHWC
+        PermutationVector perm = PermutationVector(2U, 0U, 1U);
+
+        TensorShape input_shape   = input_info.tensor_shape();
+        TensorShape weights_shape = weights_info.tensor_shape();
+        permute(input_shape, perm);
+        permute(weights_shape, perm);
+
+        input_info.set_tensor_shape(input_shape);
+        weights_info.set_tensor_shape(weights_shape);
+
+        input_info.set_data_layout(data_layout);
+        weights_info.set_data_layout(data_layout);
+        bias_info.set_data_layout(data_layout);
+    }
+
+    PadStrideInfo conv_info(1, 1, 0, 0);
+
+    TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, conv_info);
+    TensorInfo  output_info  = TensorInfo(output_shape, 1, data_type, data_layout);
+
+    Status status = NEWinogradConvolutionLayer::validate(
+                        &input_info,
+                        &weights_info,
+                        &bias_info,
+                        &output_info,
+                        conv_info,
+                        ActivationLayerInfo(),
+                        true /* fast math */);
+
+    Status fp16_supported = ::arm_compute::error_on_unsupported_cpu_fp16("N/A", "N/A", 0, &input_info);
+    bool   expected       = expected_const && static_cast<bool>(fp16_supported); // success is only expected when the fp16 support check also passes
+
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(FP32)
 
 TEST_SUITE(Conv1x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEWinogradConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(
+                           make("Input", TensorShape(8U, 8U, 32U)),
+                           make("Weight", TensorShape(1U, 3U, 32U, 1U)),
+                           make("Bias", TensorShape(1U)),
+                           make("Output", TensorShape(8U, 6U, 1U)),
+                           make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                           make("Dilation", Size2D(1U, 1U)),
+                           make("DataType", { DataType::F32 }),
+                           ActivationFunctionsDataset,
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -180,19 +458,19 @@ TEST_SUITE_END() // Conv1x3
 
 TEST_SUITE(Conv3x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -202,19 +480,19 @@ TEST_SUITE_END() // Conv3x1
 
 TEST_SUITE(Conv1x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -224,19 +502,19 @@ TEST_SUITE_END() // Conv1x5
 
 TEST_SUITE(Conv5x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -246,10 +524,10 @@ TEST_SUITE_END() // Conv5x1
 
 TEST_SUITE(Conv7x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
@@ -257,9 +535,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                                               make("DataType", { DataType::F32 })),
+                                       make("ActivationInfo", { ActivationLayerInfo() })),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -268,20 +546,20 @@ TEST_SUITE_END() // Conv7x1
 
 TEST_SUITE(Conv1x7)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x7Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -290,20 +568,40 @@ TEST_SUITE_END() // Conv1x7
 
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
+
+/// Running the activations for a single weight/input combination and data type is enough because
+/// the activation function is called on top of the Winograd output as a separate operator.
+/// TODO: Enable (currently DatasetMode::DISABLED) after COMPMID-6573 is resolved
+FIXTURE_DATA_TEST_CASE(RunActivations, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::DISABLED,
+                       combine(
+                           make("Input", TensorShape(3U, 3U, 32U)),
+                           make("Weight", TensorShape(3U, 3U, 32U, 4U)),
+                           make("Bias", TensorShape(4U)),
+                           make("Output", TensorShape(1U, 1U, 4U)),
+                           make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                           make("Dilation", Size2D(1U, 1U)),
+                           make("DataType", { DataType::F32 }),
+                           ActivationFunctionsDatasetNightly,
+                           make("DataLayout", { DataLayout::NHWC })))
+{
+    // Validate the target output against the reference within the absolute fp32 tolerance
+    validate(Accessor(_target), _reference, abs_tolerance_f32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -314,20 +612,20 @@ TEST_SUITE_END() // Conv3x3
 
 TEST_SUITE(Conv5x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -337,12 +635,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, frame
 TEST_SUITE_END() // Conv5x5
 
 FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                                                          datasets::SmallWinogradConvolutionLayer5x5Dataset()),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(framework::dataset::concat(
+                                   datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                   datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
@@ -354,12 +652,39 @@ TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, half, float>;
 
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(
+                   make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16),
+                                       TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16)
+                                     }),
+                   make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16),
+                                         TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16)
+                                       }),
+                   make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
+                                        TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F16)
+                                      }),
+                   make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                      PadStrideInfo(1, 1, 0, 0)
+                                    }),
+                   make("FastMath",
+{
+    false, // fp16 input without fast math: GEMM is expected instead of Winograd
+    true   // fp16 input with fast math: Winograd is expected
+}),
+make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD })),
+input_info, weights_info, output_info, conv_info, fast_math, expected)
+{
+    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+                                                                            &weights_info.clone()->set_is_resizable(true),
+                                                                            &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), fast_math);
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); // is_valid holds the selected ConvolutionMethod, not a boolean
+}
+
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -367,10 +692,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -381,16 +706,470 @@ TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // WinogradLayer
 
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+TEST_SUITE(FIXED_FORMAT_KERNELS)
+TEST_SUITE(VariableWeightUtils)
+
+// UC2_1_* tests: the user requests a specific fixed format, but there is no kernel that supports it.
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureNoFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ false>;
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ true>;
+
+// UC2_1
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC2_2_* tests: the user requests a specific fixed format, and a
+// kernel that supports that fixed format is found.
+
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+// These tests currently only work with SVE length 256
+// If another SVE length is used, a kernel will fail to be found
+// This needs to be addressed in order to ensure it doesn't revert to FP32 kernels for systems with SVE length other than 256
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT_EQUAL(_computed_weight_format, arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+    }
+}
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+// UC3_1_* tests: the user queries for ANY fixed format, but there is
+// no kernel that supports the use case specified by the user (for
+// example, there is no fixed format kernel for the datatype of the
+// problem).
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC3_2_* tests: the user queries for ANY fixed format. The search
+// succeeds and the fixed format found is reported back for
+// consumption by the user. Note that we only check that
+// _computed_weight_format is not one of the values that are
+// not fixed formats (ANY and UNSPECIFIED). This is because the weight
+// format that the runtime produces depends on the size of the vector
+// units of the hardware where the test is executed. For example, a
+// format like OHWIo4 for FP32 data returned for 128-bit NEON hardware
+// is replaced by OHWIo8 when running on 256-bit SVE.
+
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{ // NOTE(review): unlike UC3_2_CpuGemmConv2d, _kernel_found is not asserted here - confirm whether that omission is intentional
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); // the ANY query must have been resolved to something else
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); // UNSPECIFIED is not a fixed format either
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    if(Scheduler::get().cpu_info().has_bf16()){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    if(Scheduler::get().cpu_info().has_bf16()){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+}
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+namespace
+{
+using TestCaseType          = std::tuple<TensorShape, TensorShape, arm_compute::WeightFormat>;
+auto prepare_weights_shapes = framework::dataset::make("TensorShape",
+{
+    // OHWIo<interleave_by>i<block_by>
+    //
+    // OHWI --> O'HWI', where:
+    //
+    //   O'= smallest multiple of <interleave_by> such that O<=O'
+    //   I'= smallest multiple of <block_by> such that I<=I'
+    //
+
+    // Change N for OHWIo4
+    TestCaseType({ { 1U, 1U, 1U, 1U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 2U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 3U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 4U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 5U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 6U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 7U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 8U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 9U }, { 1U, 1U, 1U, 12U }, arm_compute::WeightFormat::OHWIo4 }),
+    // Change N for OHWIo8
+    TestCaseType({ { 1U, 1U, 1U, 1U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 2U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 3U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 4U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 5U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 6U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 7U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 8U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 9U }, { 1U, 1U, 1U, 16U }, arm_compute::WeightFormat::OHWIo8 }),
+    // Change N for OHWIo4 when H, W and C are not 1
+    TestCaseType({ { 3U, 4U, 2U, 1U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 2U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 3U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 4U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 5U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 6U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 7U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 8U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 9U }, { 3, 4, 2, 12 }, arm_compute::WeightFormat::OHWIo4 }),
+
+    // Fix N and move HWI around, with different data layouts and formats
+    TestCaseType({ { 2U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 5U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 2U, 4U, 3U, 9U }, { 2, 4, 3, 16 }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 3U, 4U, 2U, 9U }, { 3, 4, 2, 16 }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1024U, 1U, 1U, 1001U }, { 1024, 1, 1, 1008 }, arm_compute::WeightFormat::OHWIo8 }),
+
+    // Adding <block_by> on I (=C)
+    TestCaseType({ { 1U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+    TestCaseType({ { 2U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+    TestCaseType({ { 3U, 4U, 3U, 5U }, { 4, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+
+    // ---------
+    TestCaseType({ { 2, 2, 1, 5 }, { 2, 2, 1, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1, 2, 2, 5 }, { 1, 2, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+
+});
+} // unnamed namespace
+
+DATA_TEST_CASE(PrepareWeightShape, framework::DatasetMode::ALL,
+               prepare_weights_shapes, shapes)
+{ // For each (input shape, expected shape, weight format) tuple, checks the tensor shape returned by prepare_weights()
+    const TensorShape               input_shape    = std::get<0>(shapes); // shape handed to prepare_weights()
+    const TensorShape               expected_shape = std::get<1>(shapes); // shape prepare_weights() is expected to report
+    const arm_compute::WeightFormat wf             = std::get<2>(shapes); // weight format under test
+    const DataType                  DT             = DataType::F32;       // data type is the same for every case
+    const DataLayout                DL             = DataLayout::NHWC;    // layout is the same for every case
+    const auto                      TI             = TensorInfo(input_shape, 1 /*num_channels, deprecated*/, DT, DL);
+    const TensorInfo                computed_info  = ::arm_compute::test::validation::prepare_weights(TI, wf);
+    ARM_COMPUTE_EXPECT_EQUAL(computed_info.tensor_shape(), expected_shape, framework::LogLevel::ERRORS); // only the shape is compared
+}
+
+TEST_SUITE_END() // VariableWeightUtils
+
+TEST_SUITE(ExperimentalCpuAPIVariableWeightWithFixtures)
+
+template <typename ScalarType>
+using VarWidth = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloat, VarWidth<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+FIXTURE_DATA_TEST_CASE(RunSmallHalf, VarWidth<half>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F16 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using VarWidthFastMath = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloatFastMath, VarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+TEST_SUITE_END() // ExperimentalCpuAPIVariableWeightWithFixtures
+
+TEST_SUITE(ExperimentalNEAPIVariableWeightWithFixtures)
+
+template <typename ScalarType>
+using NEGEMMVarWidth = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloat, NEGEMMVarWidth<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallHalf, NEGEMMVarWidth<half>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F16 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using NEGEMMVarWidthFastMath = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloatFastMath, NEGEMMVarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+TEST_SUITE_END() // ExperimentalNEAPIVariableWeightWithFixtures
+TEST_SUITE_END() // FIXED_FORMAT_KERNELS
+
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
 TEST_SUITE(GEMMConvolutionLayer)
 template <typename T>
 using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerPaddedWeightsFixture = ConvolutionValidationPaddedWeightsFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
+
+/** Test case for memory injection in @ref cpu::CpuGemmConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto        conv        = std::make_unique<cpu::CpuGemmConv2d>();
+    const auto  src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+    auto        dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+    WeightsInfo weights_info(false, 3U, 3U, 1U);
+    conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info, weights_info); // configured exactly once, from tensor infos only
+
+    // src, weight and bias are created once here and shared by both runs; only dst is created anew on every call of the lambda below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack); // NOTE(review): presumably sets up the operator's auxiliary tensors in both packs - confirm against manage_workspace
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst); // this call's freshly allocated dst is the memory injected at run time
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        conv->prepare(prep_pack);
+        conv->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // element-wise comparison of the two outputs
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMMConvolutionLayer.
+ *
+ * Make sure @ref NEGEMMConvolutionLayer still works when memory is injected at configure time using the old API: the same layer object is configured with freshly created tensors and run, twice.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto        conv        = std::make_unique<NEGEMMConvolutionLayer>();
+    const auto  src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+    auto        dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+    WeightsInfo weights_info(false, 3U, 3U, 1U);
+    auto        run_conv = [&]() // every call creates new tensors and configures the same conv object again
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        conv->configure(&src, &weight, &bias, &dst, conv_info, weights_info); // tensors are bound at configure time, before they are allocated
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        conv->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // element-wise comparison of the two outputs
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
 
 TEST_SUITE(Float)
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+#if defined(ARM_COMPUTE_ENABLE_BF16)
 TEST_SUITE(BFLOAT16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType", DataType::BFLOAT16)),
+                                                                                                                    framework::dataset::make("DataType", Scheduler::get().cpu_info().has_bf16() ? DataType::BFLOAT16 : DataType::F32)),
                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                                                                                             ActivationFunctionsDataset))
 {
@@ -398,7 +1177,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
 TEST_SUITE_END() // BFLOAT16
-#endif           /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
+#endif           /* defined(ARM_COMPUTE_ENABLE_BF16) */
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -424,11 +1203,59 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                           framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                           framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                       framework::dataset::make("Bias", TensorShape(2U))),
+                                                                               framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                       framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                               framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::F32)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+/** Padded weights
+ * CpuGemmConv2d uses two different paths for reshaping the weights, depending on whether the weight tensor has holes
+ * (a common way to have "holes" in a tensor is via extended paddings)
+ *
+ * We only need to test the padded weight path here on a single floating data type and a single layout, because the fallback path is agnostic of them
+ */
+FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true }),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })
+                                                                                                            ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+// This very large shape test is required to test heuristic paths where the tensor size is > 1e7 bytes
+// and the weight dimensions are larger than 7
+FIXTURE_DATA_TEST_CASE(RunVeryLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::VeryLargeConvolutionLayerDataset(),
+        framework::dataset::make("ReshapeWeights", { true }),
+        framework::dataset::make("DataType", DataType::F32),
+        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+// TODO: COMPMID-6596 Extend quantized tests with at least one suite where the weight is padded (the legacy case, see floating point's RunPaddedWeights)
 template <typename T>
 using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
 
 template <typename T>
 using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEConvolutionLayer, T, int8_t>;
@@ -440,17 +1267,39 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
 });
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored.
+/// This is because, instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// When we can also support dynamic quantization in the presence of activation, these two versions should be merged
+/// again, with the explicitly specified quantization info removed.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                                                                                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
                                                                                                                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                   framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                                   framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                               framework::dataset::make("Bias", TensorShape(2U))),
+                                                                                       framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                               framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                                       framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                               QuantizedActivationFunctionsDataset))
+{
+    // Validate output produced through the MixedDataLayout variant of the quantized fixture
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -458,12 +1307,29 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<int8_t>,
                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
+                                                                                                                      framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(0.01f, -10) })),
                                                                                                                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                   framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                                   framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                               framework::dataset::make("Bias", TensorShape(2U))),
+                                                                                       framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                               framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                                       framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                               QuantizedActivationFunctionsDataset))
+{
+    // Validate output produced through the MixedDataLayout variant of the quantized fixture
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
@@ -491,6 +1357,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEGEMMConvolutionLayerQuantizedPerChannel
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+
+FIXTURE_DATA_TEST_CASE(MemoryStressLargeChannels, NEGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>,
+    framework::DatasetMode::ALL,
+        combine(
+            make("In", TensorShape(1U)),
+            make("Weights", TensorShape(1U, 1U, 1U, 17000U)),
+            make("Biases", TensorShape(17000U)),
+            make("Out", TensorShape(1U, 1U, 17000U)),
+            make("Info", PadStrideInfo(1, 1, 0, 0)),
+            make("Dilation", Size2D(1, 1)),
+            make("ReshapeWeights", { true }),
+            make("DataType", { DataType::QASYMM8_SIGNED }),
+            make("DataLayout", { DataLayout::NHWC }),
+            make("QuantizationInfo", QuantizationInfo(0.5f, 10)),
+            make("ActivationInfo", ActivationLayerInfo()),
+            make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
@@ -500,6 +1387,99 @@ TEST_SUITE(DirectGEMMConv2d)
 template <typename T>
 using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConv2d, T>;
 
+/** Test case for memory injection in @ref cpu::CpuGemmDirectConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       conv        = std::make_unique<cpu::CpuGemmDirectConv2d>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto conv_info   = Conv2dInfo{};
+    conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info);
+
+    // Source, weight and bias tensors are created once here and reused by every run_conv() call below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        conv->prepare(prep_pack);
+        conv->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMMConv2d.
+ *
+ * Make sure @ref NEGEMMConv2d still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       conv        = std::make_unique<NEGEMMConv2d>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto conv_info   = Conv2dInfo{};
+    auto       run_conv    = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        conv->configure(&src, &weight, &bias, &dst, conv_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        conv->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index adb5d1709d..b4c049f6f9 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,54 +47,86 @@ constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f);    /**< Tolerance for
 constexpr AbsoluteTolerance<float> tolerance_quantized(1.0f); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const RelativeTolerance<half_float::half> tolerance_fp16(half_float::half(0.2f)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                           tolerance_num_fp16 = 0.02f;             /**< Tolerance number for FP16 tests -- follows a slightly stricter approach compared to ConvolutionLayer tests */
 #endif                                                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-constexpr float tolerance_num = 0.07f;                                            /**< Tolerance number */
+constexpr float tolerance_num_quant = 0.07f;                                      /**< Tolerance number for quantized types */
 
 const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3)
-                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 2)
-                     * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data3x3_asymm = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadLeft", 0, 1)
-                           * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+                           * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
-const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape{ 10U, 10U, 1U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
+const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape
+{
+    10U, 10U, 1U, 1U
+})
+*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop",
+        3)  *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
-const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape{ 640U, 360U, 56U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
+const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape
+{
+    640U, 360U, 56U, 1U
+})
+*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop",
+        3)  *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
 const auto data3x3_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadX", 0, 2)
-                               * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
+                               * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
-                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
+
+const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
-const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
+const auto data_layouts_dataset = framework::dataset::make("DataLayout",
+{
+    DataLayout::NCHW, DataLayout::NHWC
+});
 
-const auto add_bias_dataset = framework::dataset::make("AddBias", { true, false });
+const auto add_bias_dataset = framework::dataset::make("AddBias",
+{
+    true, false
+});
 
 const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
 {
     QuantizationInfo(1.f / 255.f, 0),
-    QuantizationInfo(2.f, 0),
+                     QuantizationInfo(2.f, 0),
 });
 
 const auto output_qinfo_dataset = framework::dataset::make("OutputQInfo",
 {
     QuantizationInfo(3.f / 255.f, 0),
-    QuantizationInfo(4.f, 0),
+                     QuantizationInfo(4.f, 0),
 });
+
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(DeconvolutionLayer)
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
@@ -104,6 +136,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),  // Invalid bias shape
                                             TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32), // Window shrink
                                             TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(2U,2U,1U,1U), 1, DataType::F32),    // Small shape no padding
+                                            TensorInfo(TensorShape(3U,26U,26U,1U), 1, DataType::F32),    // Negative padding
                                           }),
     framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
@@ -111,6 +145,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32),
                                               TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(3U,3U,1U,1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U,1U,26U,88U), 1, DataType::F32),
                                           })),
     framework::dataset::make("BiasInfo",  { TensorInfo(TensorShape(1U), 1, DataType::F16),
                                             TensorInfo(TensorShape(1U), 1, DataType::F32),
@@ -118,6 +154,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(25U, 11U), 1, DataType::F32),
                                             TensorInfo(TensorShape(1U), 1, DataType::F32),
                                             TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(88U), 1, DataType::F32),
                                           })),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
@@ -125,6 +163,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(4U,4U,1U,1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U,78U,88U,1U), 1, DataType::F32),
                                           })),
     framework::dataset::make("PadStrideInfo", { PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
@@ -132,8 +172,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 1, 1),
                                                 PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(2, 3, 3, 1),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, false, true,true, false })),
     input_info, weights_info, bias_info, output_info, pad_info, expected)
 {
     bool is_valid = bool(NEDeconvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info));
@@ -157,6 +199,9 @@ using NEDeconvolutionLayerAsymmFixture9x9 = DeconvolutionValidationAsymmFixture<
 template <typename T>
 using NEDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using NEDeconvolutionLayerFixture5x1 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 5, 1>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 TEST_SUITE(W4x4)
@@ -220,6 +265,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerAsymmFixture9x9<float>, fra
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
 TEST_SUITE_END() // W9x9
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture5x1<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                    data_layouts_dataset),
+                                                                                                            add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // W5x1
 TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -230,7 +284,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture4x4<half>, framework::Dat
                                                                                                            add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W4x4
 TEST_SUITE(W3x3)
@@ -240,14 +294,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerFixture3x3<half>, framework
                                                                                                                   add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F16)),
                                                                                                                         data_layouts_dataset),
                                                                                                                 add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W3x3
 TEST_SUITE(W1x1)
@@ -256,9 +310,18 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture1x1<half>, framework::Dat
                                                                                                            add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W1x1
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture5x1<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                   data_layouts_dataset),
+                                                                                                           add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
+}
+TEST_SUITE_END() // W5x1
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -273,6 +336,21 @@ using NEDeconvolutionLayerQuantizedFixture3x3 = DeconvolutionValidationQuantized
 template <typename T>
 using NEDeconvolutionLayerQuantizedFixture1x1 = DeconvolutionValidationQuantizedFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using NEDeconvolutionLayerQuantizedFixture5x1 = DeconvolutionValidationQuantizedFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 5, 1>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture4x4 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 4, 4>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture3x3 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 3, 3>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture1x1 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 1, 1>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture5x1 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 5, 1>;
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 
@@ -285,7 +363,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<uint8_t>, fr
                                                                                                                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W4x4
 
@@ -299,7 +377,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
                        framework::dataset::make("DataType",
@@ -310,7 +388,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W3x3
 
@@ -323,10 +401,23 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<uint8_t>, fr
                                                                                                                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture5x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                       DataType::QASYMM8)),
+                                                                                                                       data_layouts_dataset),
+                                                                                                                       input_qinfo_dataset),
+                                                                                                                       output_qinfo_dataset),
+                                                                                                                       add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -340,7 +431,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<int8_t>, fra
                                                                                                                       add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W4x4
 
@@ -354,7 +445,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
                        framework::dataset::make("DataType",
@@ -365,24 +456,160 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType",
-                                                                                                                      DataType::QASYMM8_SIGNED)),
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1,
+                                                                                                                      framework::dataset::make("DataType",
+                                                                                                                              DataType::QASYMM8_SIGNED)),
                                                                                                                       data_layouts_dataset),
                                                                                                                       input_qinfo_dataset),
                                                                                                                       output_qinfo_dataset),
                                                                                                                       add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture5x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                      DataType::QASYMM8_SIGNED)),
+                                                                                                                      data_layouts_dataset),
+                                                                                                                      input_qinfo_dataset),
+                                                                                                                      output_qinfo_dataset),
+                                                                                                                      add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8_SIGNED
+
+const auto input_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, 10)
+});
+const auto output_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(3.f / 255.f, 0)
+});
+const auto input_signed_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, -10)
+});
+const auto output_signed_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(3.f / 255.f, 10)
+});
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+TEST_SUITE(W4x4)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture4x4<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture4x4<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W4x4
+
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture3x3<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture3x3<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(W1x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture1x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture1x1<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W1x1
+
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture5x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture5x1<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DeconvolutionLayer
diff --git a/tests/validation/NEON/DepthConvertLayer.cpp b/tests/validation/NEON/DepthConvertLayer.cpp
index 60631181bf..4972708144 100644
--- a/tests/validation/NEON/DepthConvertLayer.cpp
+++ b/tests/validation/NEON/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,25 +56,21 @@ const auto DepthConvertLayerU16toU8Dataset      = combine(framework::dataset::ma
 const auto DepthConvertLayerU16toU32Dataset     = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
 const auto DepthConvertLayerS16toU8Dataset      = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
 const auto DepthConvertLayerS16toS32Dataset     = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
-const auto DepthConvertLayerBF16toF32Dataset    = combine(framework::dataset::make("DataType", DataType::BFLOAT16), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerF16toU8Dataset      = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U8));
 const auto DepthConvertLayerF16toF32Dataset     = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerF16toS32Dataset     = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32));
 const auto DepthConvertLayerF32toF16Dataset     = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
 const auto DepthConvertLayerF32toS32Dataset     = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
 const auto DepthConvertLayerF32toU8Dataset      = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8));
-const auto DepthConvertLayerF32toBF16Dataset    = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::BFLOAT16));
 
 const auto DepthConvertLayerS32toF32Dataset     = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerS32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8));
 const auto DepthConvertLayerS32toF16Dataset     = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16));
 const auto DepthConvertLayerS32toU8Dataset      = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8));
 
-const auto DepthConvertLayerF16toQASYMM8Dataset   = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto DepthConvertLayerF32toQASYMM8Dataset   = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto DepthConvertLayerShiftDatasetNightly   = framework::dataset::make("Shift", 0, 7);
-const auto DepthConvertLayerShiftDatasetPrecommit = framework::dataset::make("Shift", { 0, 3, 6 });
-const auto DepthConvertLayerZeroShiftDataset      = framework::dataset::make("Shift", 0);
+const auto DepthConvertLayerF16toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
+const auto DepthConvertLayerF32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
+const auto DepthConvertLayerZeroShiftDataset    = framework::dataset::make("Shift", 0);
 
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int32_t> tolerance_one_int32(1);
@@ -108,7 +104,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                    ConvertPolicy::WRAP,
                                                    ConvertPolicy::WRAP,
                                                      })),
-               framework::dataset::make("Shift",{ 1, 1, 1, 1, 1, 1, 8, 1,
+               framework::dataset::make("Shift",{ 0, 0, 0, 1, 1, 1, 8, 1,
                                                      })),
                framework::dataset::make("Expected", { false, false, false, false, true})),
                input_info, output_info, policy, shift, expected)
@@ -129,8 +125,6 @@ using NEDepthConvertLayerToU8Fixture = DepthConvertLayerValidationFixture<Tensor
 template <typename T>
 using NEDepthConvertLayerToU32Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, uint32_t>;
 template <typename T>
-using NEDepthConvertLayerToBF16Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, bfloat16>;
-template <typename T>
 using NEDepthConvertLayerToF16Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, half>;
 template <typename T>
 using NEDepthConvertLayerToF32Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, float>;
@@ -188,7 +182,7 @@ TEST_SUITE_END() // QASYMM8_to_S32
 TEST_SUITE(U8_to_U16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -196,7 +190,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -206,7 +200,7 @@ TEST_SUITE_END() // U8_to_U16
 TEST_SUITE(U8_to_S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -214,7 +208,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -223,7 +217,7 @@ TEST_SUITE_END() // U8_to_S16
 TEST_SUITE(U8_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -231,7 +225,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -241,7 +235,7 @@ TEST_SUITE_END() // U8_to_S32
 TEST_SUITE(U8_to_F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toF32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -249,7 +243,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToF32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toF32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -260,7 +254,7 @@ TEST_SUITE_END() // U8_to_F32
 TEST_SUITE(U8_to_F16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toF16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -268,7 +262,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToF16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toF16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -279,14 +273,14 @@ TEST_SUITE_END() // U8_to_F36
 TEST_SUITE(U16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -296,14 +290,14 @@ TEST_SUITE_END() // U16_to_U8
 TEST_SUITE(U16_to_U32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                       DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetNightly))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -313,14 +307,14 @@ TEST_SUITE_END() // U16_to_U32
 TEST_SUITE(S16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                   DepthConvertLayerShiftDatasetNightly))
+                                                                                                                   DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -330,42 +324,20 @@ TEST_SUITE_END() // S16_to_U8
 TEST_SUITE(S16_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S16_to_S32
 
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
-TEST_SUITE(BFLOAT16_to_F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<bfloat16>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerBF16toF32Dataset),
-                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerZeroShiftDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BFLOAT16_to_F32
-
-TEST_SUITE(F32_to_BFLOAT16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToBF16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerF32toBF16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerZeroShiftDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // F32_to_BFLOAT16
-#endif           /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16_to_QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToQASYMM8Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 6bb40be036..e9609b7b72 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,32 +42,70 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
-constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr RelativeTolerance<float>   tolerance_f32(0.01f);        /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);        /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8_SIGNED */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float                     tolerance_num = 0.05f;                 /**< Tolerance number */
 #endif                                                                     // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-const auto depth_multipliers       = framework::dataset::make("DepthMultiplier", { 1, 2, 8 });
-const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 32 });
+const auto depth_multipliers       = make("DepthMultiplier", { 1, 2, 8 }); /**< Depth multipliers combined with the RunSmall (PRECOMMIT) datasets below */
+const auto large_depth_multipliers = make("DepthMultiplier", { 5, 32 });   /**< Depth multipliers combined with the RunLarge (NIGHTLY) datasets below */
 
-//Activation Functions
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+// Activation function datasets (every one of them uses the "ActivationInfo" dataset name)
+const auto NoActivation = make("ActivationInfo", ActivationLayerInfo()); // single entry: a default-constructed ActivationLayerInfo
+
+const auto ActivationFunctionsDataset = make("ActivationInfo", // default-constructed info plus RELU
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
 });
 
-const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
+const auto ActivationFunctionsDatasetNightly = make("ActivationInfo", // wider sweep for the NIGHTLY RunActivations case; omits the default info and RELU held by ActivationFunctionsDataset
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+#ifdef __aarch64__
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU), // only part of the dataset when built for aarch64
+#endif // __aarch64__
+});
+
+const auto ActivationFunctionsQuantizedSmallDataset = make("ActivationInfo", // RELU only; NOTE(review): presumably for the quantized precommit cases - confirm at the use sites
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+});
+
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo", // BOUNDED_RELU and LU_BOUNDED_RELU only; NOTE(review): presumably for the quantized nightly cases - confirm at the use sites
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+});
+
+// This is only used when there is a fused activation
+const auto input_qinfo_dataset = make("InputQInfo",
 {
     QuantizationInfo(0.3f, 10),
     QuantizationInfo(2.2f, 10),
 });
+
+const auto IgnoredQuantizationInfo = make("IgnoredQuantizationInfo", QuantizationInfo()); // single entry: a default-constructed QuantizationInfo
+
 } // namespace
 
 TEST_SUITE(NEON)
@@ -76,7 +114,7 @@ TEST_SUITE(DepthwiseConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+               make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                        TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Unsupported weights dimensions
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
@@ -88,7 +126,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // dilation < 1
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                      }),
-               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
+               make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(5U, 5U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
@@ -100,7 +138,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                        })),
-               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+               make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
@@ -112,7 +150,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+               make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
@@ -124,7 +162,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+               make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
@@ -136,7 +174,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                      })),
-               framework::dataset::make("DepthMultiplier", { 1,
+               make("DepthMultiplier", { 1,
                                                              1,
                                                              1,
                                                              3,
@@ -148,7 +186,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                              1,
                                                              1,
                                                             })),
-               framework::dataset::make("Dilation", { Size2D(1U, 1U),
+               make("Dilation", { Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
@@ -160,7 +198,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       Size2D(0U, 1U),
                                                       Size2D(1U, 1U),
                                                             })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
+               make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
     bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false),
@@ -169,7 +207,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
 }
 
 DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+                make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                         TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Invalid biases size
@@ -178,7 +216,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Patch size bigger than input width
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Dilation < 1
                                                       }),
-                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
+                make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
@@ -187,7 +225,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
                                                         })),
-                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+                make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(4U), 1, DataType::F32),
@@ -196,7 +234,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                        })),
-                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
@@ -205,7 +243,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
                                                        })),
-                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
@@ -214,7 +252,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                       })),
-                framework::dataset::make("DepthMultiplier", { 1,
+                make("DepthMultiplier", { 1,
                                                               1,
                                                               3,
                                                               1,
@@ -223,7 +261,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                               2,
                                                               2,
                                                              })),
-                framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                make("Dilation", { Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
@@ -232,7 +270,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        Size2D(25U, 1U),
                                                        Size2D(0U, 1U),
                                                              })),
-                framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})),
+                make("Expected", { false, false, false, false, false, false, false, false})),
                 input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
     bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
@@ -242,36 +280,60 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
 // *INDENT-ON*
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+template <typename T>
+using NEDepthwiseConvolutionLayerMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, true>; // same fixture with its 5th template argument set to true; NOTE(review): presumably the mixed-data-layout switch - confirm in the fixture header
+template <typename T>
+using NEDepthwiseConvolutionLayerVariableWeightsFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, false, false, true>; // same fixture with its 7th template argument set to true; NOTE(review): presumably the variable-weights switch - confirm in the fixture header
 
 TEST_SUITE(Float)
 TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine( // one fixed F32 convolution configuration, crossed with both data layouts and every nightly activation
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::F32),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsDatasetNightly))
+{
+    validate(Accessor(_target), _reference, tolerance_f32); // check the target output against the reference within tolerance_f32
+}
+
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                            depth_multipliers),
-                           framework::dataset::make("DataType",
-                                                    DataType::F32)),
-                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                            ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           make("DepthMultiplier", { 2 })), // a single depth multiplier rather than the depth_multipliers sweep
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("ActivationInfo", ActivationLayerInfo()))) // same contents as the NoActivation dataset declared in the anonymous namespace
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                            large_depth_multipliers),
-                           framework::dataset::make("DataType",
-                                                    DataType::F32)),
-                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                           ActivationFunctionsDataset))
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NHWC })), // the large nightly run covers NHWC only
+                           make("ActivationInfo", { ActivationLayerInfo() }))) // and only the default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -279,10 +341,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>,
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })), // the large dilated nightly run covers NHWC only
+                                   make("ActivationInfo", { ActivationLayerInfo() }))) // and only the default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -292,9 +354,9 @@ TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                            depth_multipliers),
-                           framework::dataset::make("DataType",
+                           make("DataType",
                                                     DataType::F32)),
-                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                            ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -302,10 +364,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>,
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })), // the large 3x3 nightly run covers NHWC only
+                                   make("ActivationInfo", { ActivationLayerInfo() }))) // and only the default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -313,9 +375,9 @@ TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -323,10 +385,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>,
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })), // the large dilated 3x3 nightly run covers NHWC only
+                                   make("ActivationInfo", { ActivationLayerInfo() }))) // and only the default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -337,31 +399,70 @@ TEST_SUITE_END() // W3x3
 TEST_SUITE(Optimized)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                   framework::dataset::make("DataType",
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType", DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                   framework::dataset::make("DataType",
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall5x5, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                   framework::dataset::make("DataType",
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
                                                                             DataType::F32)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsLarge3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -370,22 +471,37 @@ TEST_SUITE_END() // F32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::F16),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsDatasetNightly))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                            depth_multipliers),
-                           framework::dataset::make("DataType",
+                           make("DataType",
                                                     DataType::F16)),
-                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                            ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                                                                                                                         large_depth_multipliers),
-                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                        make("DataType",
                                                                                                                                 DataType::F16)),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                        ActivationFunctionsDataset))
+                                                                                                                        make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
@@ -394,9 +510,8 @@ TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
-                                                                            DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                   make("DataType", DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -404,10 +519,9 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, f
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
-                                                                            DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                   make("DataType", DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
@@ -419,9 +533,9 @@ using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFi
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                            depth_multipliers),
-                           framework::dataset::make("DataType",
+                           make("DataType",
                                                     DataType::F16)),
-                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                            ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -429,10 +543,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, f
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -442,9 +556,9 @@ TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -452,10 +566,10 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, f
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                            large_depth_multipliers),
-                                                   framework::dataset::make("DataType",
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -466,31 +580,31 @@ TEST_SUITE_END() // W3x3
 TEST_SUITE(Optimized)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmallW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                   framework::dataset::make("DataType",
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunSmallW5x5, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                   framework::dataset::make("DataType",
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLargeW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                   framework::dataset::make("DataType",
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
                                                                             DataType::F16)),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -501,94 +615,162 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 template <typename T>
-using NEDepthwiseConvolutionLayerQuantizedFixtureOptimized = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-template <typename T>
 using NEDepthwiseConvolutionLayerQuantizedFixture                    = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+template <typename T>
+using NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture     = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, true>;
 using NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture = DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, uint8_t, int8_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        make("DepthMultiplier", { 2 }),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
-
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -596,39 +778,69 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                                   framework::dataset::make("DataType",
-                                                                                            DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                                   framework::dataset::make("DataType",
-                                                                                            DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                                   framework::dataset::make("DataType",
-                                                                                            DataType::QASYMM8)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -636,143 +848,242 @@ TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                           large_depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                                   framework::dataset::make("DataType",
-                                                                                            DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                           combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                                   framework::dataset::make("DataType",
-                                                                                            DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                           combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                           framework::dataset::make("DepthMultiplier", 1)),
-                                                                   framework::dataset::make("DataType",
-                                                                                            DataType::QASYMM8_SIGNED)),
-                                                           input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                   ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("InputDataType", DataType::QASYMM8),
+        make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE(Generic)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
                                                            input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -782,11 +1093,11 @@ TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
                                                            input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -794,12 +1105,12 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetr
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
                                                                                    depth_multipliers),
-                                                                           framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
                                                            input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -809,25 +1120,25 @@ TEST_SUITE_END() // Generic
 TEST_SUITE(Optimized)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
                            combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                                   framework::dataset::make("DepthMultiplier", 1)),
-                                                                           framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                                                   make("DepthMultiplier", 1)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
                                                            input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
                                    ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
                            combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                                   framework::dataset::make("DepthMultiplier", 1)),
-                                                                           framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                                   framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                                                   make("DepthMultiplier", 1)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
                                                            input_qinfo_dataset),
-                                                   framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                   ActivationFunctionsDataset))
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index 3314227bec..221fc5d249 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/framework/Macros.h"
@@ -37,12 +38,12 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-// Create function for NEDepthwiseConvolutionLayerKernel
-using NEDepthwiseConvolutionLayerNative = NESynthetizeFunctionWithZeroConstantKernelBorder<NEDepthwiseConvolutionLayerNativeKernel>;
+// Create function for CpuDepthwiseConv2dNativeKernel
+using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConv2dNativeKernel>;
 
 // Fixture for NEDepthwiseConvolutionLayerKernel
 template <typename T>
-using NEDepthwiseConvolutionLayerNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerNative, T>;
+using CpuDepthwiseConvolutionNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, CpuDepthwiseConvolutionNative, T>;
 
 namespace
 {
@@ -124,8 +125,9 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
     auto biases  = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout);
     auto dst     = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout);
 
-    NEDepthwiseConvolutionLayerNativeKernel dwc;
-    dwc.configure(&src, &weights, &biases, &dst, pad_stride_info);
+    cpu::kernels::CpuDepthwiseConv2dNativeKernel dwc;
+    const ConvolutionInfo info{pad_stride_info, 1, ActivationLayerInfo(), Size2D(1, 1)};
+    dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info);
 
     ARM_COMPUTE_EXPECT(src.info()->padding().empty(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(weights.info()->padding().empty(), framework::LogLevel::ERRORS);
@@ -133,9 +135,47 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
     ARM_COMPUTE_EXPECT(dst.info()->padding().empty(), framework::LogLevel::ERRORS);
 }
 
+TEST_SUITE(KERNEL_SELECTION)
+DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
+               combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QASYMM8,
+                                                              DataType::QSYMM8_PER_CHANNEL
+                                                            })),
+                       framework::dataset::make("DataType_per_channel", { DataType::QASYMM8,
+                                                                          DataType::QASYMM8_SIGNED
+                                                            })),
+                cpu_ext, data_type, data_type_per_channel)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDepthwiseConv2dNativeKernel::get_implementation(
+        DepthwiseConv2dNativeDataTypeISASelectorData{ data_type, data_type_per_channel,cpu_isa },
+        cpu::KernelSelectionType::Preferred );
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string per_channel_str = "_";
+    if (data_type == DataType::QSYMM8_PER_CHANNEL)
+    {
+        per_channel_str = "_" + cpu_impl_dt(data_type_per_channel) + "_" ;
+    }
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type)  + per_channel_str + "deptwiseconv2dnative";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KERNEL_SELECTION
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit,
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
@@ -152,7 +192,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<fl
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::NIGHTLY,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_nightly,
                                                                                                 height_values_nightly),
                                                                                                 channel_values_nightly),
diff --git a/tests/validation/NEON/DetectionPostProcessLayer.cpp b/tests/validation/NEON/DetectionPostProcessLayer.cpp
index a166402a79..7d725327b7 100644
--- a/tests/validation/NEON/DetectionPostProcessLayer.cpp
+++ b/tests/validation/NEON/DetectionPostProcessLayer.cpp
@@ -150,7 +150,7 @@ inline void base_test_case(DetectionPostProcessLayerInfo info, DataType data_typ
         quantize_and_fill_tensor(Accessor(anchors), anchors_vector);
     }
 
-    // Determine the output through the Neon kernel
+    // Determine the output through the Compute Library operator
     Tensor                      output_boxes;
     Tensor                      output_classes;
     Tensor                      output_scores;
diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp
index 2f0fce2ce0..fbfe8b8a7a 100644
--- a/tests/validation/NEON/DilatedConvolutionLayer.cpp
+++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/cpu/operators/CpuConv2d.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/DilatedConvolutionLayerDataset.h"
@@ -49,7 +50,7 @@ const AbsoluteTolerance<float>            abs_tolerance_f16(0.3f);
 const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float                           tolerance_num_f16 = 0.07f;                 /**< Tolerance number for FP16 */
 #endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8(1);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -96,7 +97,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                           framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
                input_info, weights_info, output_info, conv_info, dilation, expected)
 {
-    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
+    ConvolutionMethod is_valid = cpu::CpuConv2d::get_convolution_method(&input_info.clone()->set_is_resizable(false),
                                                                             &weights_info.clone()->set_is_resizable(false),
                                                                             &output_info.clone()->set_is_resizable(false),
                                                                             conv_info, WeightsInfo(), dilation);
@@ -161,13 +162,18 @@ template <typename T>
 using NEGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
 
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored.
+/// This is because instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// When we can also support dynamic quantization with the presence of activation, we can remove the explicit
+/// quantization info.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
 {
     // Validate output
@@ -178,7 +184,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerQuantizedFixture<u
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
 {
     // Validate output
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 6c47fa1cf8..0779c9d388 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,12 @@
  */
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -70,8 +73,8 @@ const auto data_pad_f16 = concat(combine(framework::dataset::make("PadX", { 0, 1
                                                  framework::dataset::make("KernelSize", 1))));
 
 const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(),
-                              combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
-                                      combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+                              combine(framework::dataset::make("StrideX", { 1, 2, 3, 4 }),
+                                      combine(framework::dataset::make("StrideY", { 1, 2, 3, 4 }),
                                               data_pad_f32)));
 
 const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(),
@@ -87,17 +90,25 @@ const auto data_prec = combine(datasets::SmallDirectConvolutionShapes(),
                                                                framework::dataset::make("KernelSize", 3))))));
 
 const auto data9x9 = combine(datasets::SmallDirectConvolutionShapes(),
-                             combine(framework::dataset::make("StrideX", { 1 }),
-                                     combine(framework::dataset::make("StrideY", { 1 }),
+                             combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+                                     combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
                                              combine(framework::dataset::make("PadX", { 0, 2 }),
                                                      combine(framework::dataset::make("PadY", { 0, 3 }),
                                                              framework::dataset::make("KernelSize", 9))))));
 
-const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4 }));
-const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data8x8 = combine(datasets::SmallDirectConvolutionShapes(),
+                             combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+                                     combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+                                             combine(framework::dataset::make("PadX", { 0 }),
+                                                     combine(framework::dataset::make("PadY", { 0 }),
+                                                             framework::dataset::make("KernelSize", 8))))));
+
+const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4, 5 }));
+const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4, 5 }));
 
 const auto data_precommit    = combine(data_prec, framework::dataset::make("NumKernels", { 1 }));
 const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 }));
+const auto data_precommit8x8 = combine(data8x8, framework::dataset::make("NumKernels", { 4 }));
 
 /* The following tests is from real use-case that made DirectConvolution
  * overflows in terms of its tensor indexing. This test case is using
@@ -129,17 +140,95 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
 TEST_SUITE(NEON)
 TEST_SUITE(DirectConvolutionLayer)
 
+/** Check whether the configuration of a Direct Convolution layer with no
+ * bias leads to a successful execution.
+ */
+TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
+{
+    const auto     src_shape     = TensorShape(27U, 13U, 2U);
+    const auto     weights_shape = TensorShape(3U, 3U, 2U, 4U);
+    const auto     bias_shape    = TensorShape(4U);
+    const auto     dst_shape     = TensorShape(25U, 11U, 4U);
+    constexpr auto dt            = DataType::F32;
+
+    auto src     = create_tensor<Tensor>(src_shape, dt);
+    auto weights = create_tensor<Tensor>(weights_shape, dt);
+    auto dst     = create_tensor<Tensor>(dst_shape, dt);
+
+    const auto conv_info = PadStrideInfo(1, 1, 0, 0);
+
+    // Create Direct Convolution function
+    NEDirectConvolutionLayer conv{};
+    conv.configure(&src, &weights, nullptr, &dst, conv_info);
+
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    library->fill_tensor_value(Accessor(src), 1.f);
+    library->fill_tensor_value(Accessor(weights), 1.f);
+
+    conv.run();
+
+    // Compute reference to compare
+    SimpleTensor<float> ref_src{ src_shape, dt };
+    SimpleTensor<float> ref_weights{ weights_shape, dt };
+    SimpleTensor<float> ref_bias{ bias_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_weights, 1.f);
+    // No bias
+    library->fill_tensor_value(ref_bias, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_weights, ref_bias, dst_shape, conv_info);
+
+    validate(Accessor(dst), ref_dst);
+}
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               concat(combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F32 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                      combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F16 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW }))),
+               cpu_ext, data_type, data_layout)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ data_type, data_layout, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string data_layout_str;
+    if(data_layout == DataLayout::NCHW)
+    {
+        data_layout_str = "nchw";
+    }
+    else
+    {
+        data_layout_str = "nhwc";
+    }
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_" + data_layout_str + "_directconv2d";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching input feature maps
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching data type input/weights
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching input feature maps
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported kernel width
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non-rectangular weights dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported non-rectangular weights dimensions
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid weights dimensions
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid stride
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases size
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported stride
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases size
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases dimensions
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size
                                               }),
         framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F16),
@@ -185,7 +274,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
                                                        framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
 })),
         framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false })),
         input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
@@ -235,6 +331,8 @@ DATA_TEST_CASE(NoPaddingNHWCKernel, framework::DatasetMode::ALL, combine(combine
 
 template <typename T>
 using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<Tensor, Accessor, NEDirectConvolutionLayer, T>;
+template <typename T>
+using NEDirectConvolutionLayerMixedDataLayoutFixture = DirectConvolutionValidationFixture<Tensor, Accessor, NEDirectConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -266,6 +364,24 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolutionLayerFixture<float>, framewo
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEDirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       ActivationFunctionsDataset),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall8x8, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit8x8, framework::dataset::make("DataType",
+                                                                                                                       DataType::F32)),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall9x9, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit9x9, framework::dataset::make("DataType",
                                                                                                                        DataType::F32)),
                                                                                                                        ActivationFunctionsDataset),
diff --git a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
index 87f4c7f187..0667ac73f9 100644
--- a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
+++ b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,13 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 
 TEST_SUITE(NEON)
@@ -53,6 +60,9 @@ TEST_SUITE(AbsLayer)
 template <typename T>
 using NEAbsLayerFixture = AbsValidationFixture<Tensor, Accessor, NEAbsLayer, T>;
 
+template <typename T>
+using NEAbsLayerQuantizedFixture = AbsQuantizedValidationFixture<Tensor, Accessor, NEAbsLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -107,6 +117,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsLayerFixture<int32_t>, framework::DatasetM
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // AbsLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp
index 3656560281..95db4ad5fd 100644
--- a/tests/validation/NEON/ElementwiseDivision.cpp
+++ b/tests/validation/NEON/ElementwiseDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@ namespace validation
 namespace
 {
 RelativeTolerance<float> tolerance_fp32(0.000001f);
-AbsoluteTolerance<int>   tolerance_zero_s32(1); // Tolerance for S32 division
+AbsoluteTolerance<int>   tolerance_zero_s32(0); // Tolerance for S32 division
 
 /** Input data sets **/
 const auto ElementwiseDivisionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32),
@@ -56,6 +56,8 @@ const auto              ElementwiseDivisionFP16Dataset = combine(combine(framewo
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseDivisionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                     framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -93,10 +95,41 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+// These test cases will execute the function with dynamic-stated shapes.
+// Since other elementwise operations share the same kernel, these tests are added only here.
+// Also, only FP32 is tested since data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+template <typename T>
+using CpuElementwiseDivisionDynamicShapeFixture = ArithmeticDivisionDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+template <typename T>
+using CpuElementwiseDivisionBroadcastDynamicShapeFixture = ArithmeticDivisionBroadcastDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuElementwiseDivisionDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset),
+                                                                                                                        InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CpuElementwiseDivisionBroadcastDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+TEST_SUITE_END() // F32
+TEST_SUITE_END() // DynamicShape
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP16Dataset),
+                                                                                                          InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -105,7 +138,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset),
+                                                                                                           InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -114,8 +148,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework:
 template <typename T>
 using NEElementwiseDivisionBroadcastFixture = ArithmeticDivisionBroadcastValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseDivisionFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseDivisionFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -125,7 +167,8 @@ TEST_SUITE_END() // Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<int32_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionS32Dataset),
+                                                                                                             InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_zero_s32);
@@ -134,7 +177,7 @@ TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
 TEST_SUITE_END() // ElementwiseDivision
-TEST_SUITE_END() // Neon
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseExpLayer.cpp b/tests/validation/NEON/ElementwiseExpLayer.cpp
index 211e10fa45..31cd78626f 100644
--- a/tests/validation/NEON/ElementwiseExpLayer.cpp
+++ b/tests/validation/NEON/ElementwiseExpLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,15 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // A difference of 1 is tolerated because the reference quantizes with rounding policy "TO_NEAREST_UP", whereas the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // defined(__aarch64__)
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(ExpLayer)
@@ -53,6 +62,9 @@ TEST_SUITE(ExpLayer)
 template <typename T>
 using NEExpLayerFixture = ExpValidationFixture<Tensor, Accessor, NEExpLayer, T>;
 
+template <typename T>
+using NEExpLayerQuantizedFixture = ExpQuantizedValidationFixture<Tensor, Accessor, NEExpLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -82,6 +94,32 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerFixture<float>, framework::DatasetMod
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.01, 0) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.003, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.02, -1) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.002, -2) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // ExpLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseKernelSelection.cpp b/tests/validation/NEON/ElementwiseKernelSelection.cpp
new file mode 100644
index 0000000000..7990a51936
--- /dev/null
+++ b/tests/validation/NEON/ElementwiseKernelSelection.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuElementwiseKernel.h"
+#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(KernelSelection)
+
+DATA_TEST_CASE(KernelSelection_elementwise_unary, framework::DatasetMode::ALL, concat(
+                   combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                           framework::dataset::make("DataType", { DataType::F32,
+                                                                  DataType::F16,
+                                                                  DataType::S32
+                                                                })),
+                   combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                           framework::dataset::make("DataType", { DataType::F32,
+                                                                  DataType::F16,
+                                                                  DataType::S32
+                                                                }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    // Build an ISA description exposing only the extension under test (FP16 only for F16 data).
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+    // Query the implementation the unary kernel prefers for this data type / ISA combination.
+    const auto *selected_impl = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    // Its name must be "<lower_string(cpu_ext)>_<cpu_impl_dt(data_type)>_elementwise_unary".
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_elementwise_unary";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelection_elementwise_arithmetic, framework::DatasetMode::ALL, concat(concat(
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::QASYMM8,
+                                                                                                               DataType::QASYMM8_SIGNED
+                                                                                                                                            })),
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16
+                                                                                                                                            }))),
+                                                                                           combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                   framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                                                                           DataType::QASYMM8_SIGNED
+                                                                                                                                        }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    // Build an ISA description exposing only the extension under test (FP16 only for F16 data).
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+    // Query the preferred arithmetic implementation for this data type / ISA, with ADD as the operation.
+    const auto *selected_impl = CpuArithmeticKernel::get_implementation(
+                                    ElementwiseDataTypeISASelectorData{ data_type, cpu_isa, static_cast<int>(ArithmeticOperation::ADD) },
+                                    cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    // Its name must be "<lower_string(cpu_ext)>_<cpu_impl_dt(data_type)>_arithmetic".
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_arithmetic";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelection_elementwise_comparison, framework::DatasetMode::ALL, concat(concat(
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::U8,
+                                                                                                               DataType::QASYMM8,
+                                                                                                               DataType::QASYMM8_SIGNED
+                                                                                                                                            })),
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::U8
+                                                                                                                                            }))),
+                                                                                           combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                   framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                                                                           DataType::QASYMM8_SIGNED
+                                                                                                                                        }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    // Build an ISA description exposing only the extension under test (FP16 only for F16 data).
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+    // Query the preferred comparison implementation for this data type / ISA, with Equal as the operation.
+    const auto *selected_impl = CpuComparisonKernel::get_implementation(
+                                    ElementwiseDataTypeISASelectorData{ data_type, cpu_isa, static_cast<int>(ComparisonOperation::Equal) },
+                                    cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    // Its name must be "<lower_string(cpu_ext)>_<cpu_impl_dt(data_type)>_comparison".
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_comparison";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END() // KernelSelection
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseLog.cpp b/tests/validation/NEON/ElementwiseLog.cpp
index 3115ed6065..1175903dac 100644
--- a/tests/validation/NEON/ElementwiseLog.cpp
+++ b/tests/validation/NEON/ElementwiseLog.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,15 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // A difference of 1 is tolerated because the reference quantizes with rounding policy "TO_NEAREST_UP", whereas the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // defined(__aarch64__)
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(LogLayer)
@@ -53,6 +62,9 @@ TEST_SUITE(LogLayer)
 template <typename T>
 using NELogLayerFixture = LogValidationFixture<Tensor, Accessor, NELogLayer, T>;
 
+template <typename T>
+using NELogLayerQuantizedFixture = LogQuantizedValidationFixture<Tensor, Accessor, NELogLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -88,6 +100,33 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NELogLayerFixture<float>, framework::DatasetMod
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(10.5, 0), QuantizationInfo(0.5, -10)  })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.75, -128) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(12.5, -2) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // LogLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseMax.cpp b/tests/validation/NEON/ElementwiseMax.cpp
index 4bc263184e..61421ab3e5 100644
--- a/tests/validation/NEON/ElementwiseMax.cpp
+++ b/tests/validation/NEON/ElementwiseMax.cpp
@@ -62,6 +62,8 @@ const auto ElementwiseMaxFP16Dataset = combine(combine(framework::dataset::make(
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseMaxFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -111,7 +113,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMaxS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMaxS32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -119,7 +122,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::Da
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -131,11 +135,12 @@ using NEElementwiseMaxQuantizedFixture = ElementwiseMaxValidationQuantizedFixtur
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMaxQASYMM8Dataset),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -144,11 +149,13 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, fram
 template <typename T>
 using NEElementwiseMaxQuantizedBroadcastFixture = ElementwiseMaxQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseMaxQASYMM8Dataset),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseMaxQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,16 +163,26 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixt
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, 20) })),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 0) })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })))
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })),
+                                                                                                                      OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallInPlace, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       ElementwiseMaxQASYMM8SignedDataset),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-
 TEST_SUITE_END()
 
 TEST_SUITE_END()
@@ -173,7 +190,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset),
+                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -182,7 +200,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset),
+                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -190,8 +209,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::Data
 template <typename T>
 using NEElementwiseMaxBroadcastFixture = ElementwiseMaxBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                                                                                                                        ElementwiseMaxFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                                        ElementwiseMaxFP32Dataset),
+                                                                                                                        OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseMaxFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/ElementwiseMin.cpp b/tests/validation/NEON/ElementwiseMin.cpp
index 3836b90308..a134eb354d 100644
--- a/tests/validation/NEON/ElementwiseMin.cpp
+++ b/tests/validation/NEON/ElementwiseMin.cpp
@@ -62,6 +62,8 @@ const auto ElementwiseMinFP16Dataset = combine(combine(framework::dataset::make(
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseMinFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -110,7 +112,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMinS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMinS32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -118,7 +121,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::Da
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinS16Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -133,23 +137,34 @@ TEST_SUITE(QASYMM8)
 template <typename T>
 using NEElementwiseMinQuantizedBroadcastFixture = ElementwiseMinQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseMinQASYMM8Dataset),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseMinQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                               ElementwiseMinQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                               InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMinQASYMM8Dataset),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) }))
-
-                      )
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -157,11 +172,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, fram
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, 20) })),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 0) })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })))
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
@@ -174,7 +190,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset),
+                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,7 +200,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset),
+                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -192,8 +210,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::Data
 template <typename T>
 using NEElementwiseMinBroadcastFixture = ElementwiseMinBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                                                                                                                        ElementwiseMinFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                                        ElementwiseMinFP32Dataset),
+                                                                                                                        OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseMinFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/ElementwiseNegation.cpp b/tests/validation/NEON/ElementwiseNegation.cpp
index 629baa80e6..5b8ae8fc64 100644
--- a/tests/validation/NEON/ElementwiseNegation.cpp
+++ b/tests/validation/NEON/ElementwiseNegation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,13 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is a difference of 1 because quantizing in the reference uses the rounding policy "TO_NEAREST_UP", whereas the armv7a Neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(NegLayer)
@@ -53,6 +60,9 @@ TEST_SUITE(NegLayer)
 template <typename T>
 using NENegLayerFixture = NegValidationInPlaceFixture<Tensor, Accessor, NENegLayer, T>;
 
+template <typename T>
+using NENegLayerQuantizedFixture = NegQuantizedValidationFixture<Tensor, Accessor, NENegLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -113,6 +123,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetM
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // NegLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwisePower.cpp b/tests/validation/NEON/ElementwisePower.cpp
index 4305387c5f..9ac9eec280 100644
--- a/tests/validation/NEON/ElementwisePower.cpp
+++ b/tests/validation/NEON/ElementwisePower.cpp
@@ -51,6 +51,8 @@ const auto              ElementwisePowerFP16Dataset = combine(combine(framework:
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwisePowerFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                  framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -91,7 +93,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset),
+                                                                                                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -101,13 +104,15 @@ TEST_SUITE_END() // F16
 
 TEST_SUITE(F32)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ElementwisePowerFP32Dataset),
+                                                                                                            InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -116,15 +121,23 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::Da
 template <typename T>
 using NEElementwisePowerBroadcastFixture = ElementwisePowerBroadcastValidationFixture<Tensor, Accessor, NEElementwisePower, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                       ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwisePowerFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
 }
-
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapesBroadcast(),
-                       ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwisePowerFP32Dataset),
+                       InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+                       ElementwisePowerFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
diff --git a/tests/validation/NEON/ElementwiseRound.cpp b/tests/validation/NEON/ElementwiseRound.cpp
index 5ff81a5d8a..620618cb0b 100644
--- a/tests/validation/NEON/ElementwiseRound.cpp
+++ b/tests/validation/NEON/ElementwiseRound.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -40,12 +40,20 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+} // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(RoundLayer)
 
 template <typename T>
 using NERoundLayerFixture = RoundValidationFixture<Tensor, Accessor, NERoundLayer, T>;
 
+template <typename T>
+using NERoundLayerQuantizedFixture = RoundQuantizedValidationFixture<Tensor, Accessor, NERoundLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -81,6 +89,33 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NERoundLayerFixture<float>, framework::DatasetM
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERoundLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERoundLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RoundLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
index f41500cc0b..80788c893f 100644
--- a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
+++ b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,13 +46,42 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is a difference of 1 because quantizing in the reference uses the rounding policy "TO_NEAREST_UP", whereas the armv7a Neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(RsqrtLayer)
 
+// These test cases will execute the function with dynamic-stated shapes.
+// Since other elementwise unary operations share the same kernel, these tests are added only here.
+// Also, only FP32 is tested since the data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+TEST_SUITE(FP32)
+
+template <typename T>
+using CpuRsqrtDynamicShapeFixture = RsqrtDynamicShapeValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuRsqrtDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                          DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // DynamicShape
+
 template <typename T>
 using NERsqrtLayerFixture = RsqrtValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
 
+template <typename T>
+using NERsqrtLayerQuantizedFixture = RsqrtQuantizedValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -83,6 +112,32 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerFixture<float>, framework::DatasetM
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(20, 0) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(25, -128) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RsqrtLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseSin.cpp b/tests/validation/NEON/ElementwiseSin.cpp
index 9b212e264f..9c2d7ae268 100644
--- a/tests/validation/NEON/ElementwiseSin.cpp
+++ b/tests/validation/NEON/ElementwiseSin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,8 @@ AbsoluteTolerance<float> tolerance_fp32(0.00001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 AbsoluteTolerance<float> tolerance_fp16(0.0005f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(SinLayer)
@@ -53,6 +55,9 @@ TEST_SUITE(SinLayer)
 template <typename T>
 using NESinLayerFixture = SinValidationFixture<Tensor, Accessor, NESinLayer, T>;
 
+template <typename T>
+using NESinLayerQuantizedFixture = SinQuantizedValidationFixture<Tensor, Accessor, NESinLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -89,6 +94,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESinLayerFixture<float>, framework::DatasetMod
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESinLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(200, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESinLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.07, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(123, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // SinLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseSquareDiff.cpp b/tests/validation/NEON/ElementwiseSquareDiff.cpp
index 069cbbd7fa..9a86b541de 100644
--- a/tests/validation/NEON/ElementwiseSquareDiff.cpp
+++ b/tests/validation/NEON/ElementwiseSquareDiff.cpp
@@ -68,6 +68,8 @@ const auto ElementwiseSquaredDiffFP16Dataset = combine(combine(framework::datase
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseSquaredDiffFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                        framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -109,7 +111,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset),
+                                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -117,7 +120,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, frame
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset),
+                                                                                                                InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -129,13 +133,12 @@ using NEElementwiseSquaredDiffQuantizedFixture = ElementwiseSquaredDiffValidatio
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8Dataset),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) }))
-
-                      )
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -144,11 +147,23 @@ template <typename T>
 using NEElementwiseSquaredDiffQuantizedBroadcastFixture = ElementwiseSquaredDiffQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseSquaredDiff, T>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                                                       ElementwiseSquaredDiffQASYMM8Dataset),
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseSquaredDiffQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseSquaredDiffQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                               ElementwiseSquaredDiffQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,11 +171,12 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffQuantizedBroad
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8SignedDataset),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 5) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(.5f, 5) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(.2f, 5) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(.2f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -171,7 +187,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset),
+                                                                                                             InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -180,7 +197,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -188,15 +206,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framewo
 template <typename T>
 using NEElementwiseSquaredDiffBroadcastFixture = ElementwiseSquaredDiffBroadcastValidationFixture<Tensor, Accessor, NEElementwiseSquaredDiff, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseSquaredDiffFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapesBroadcast(),
-                       ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+                       ElementwiseSquaredDiffFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp
index 7125158a21..f7ef0a314e 100644
--- a/tests/validation/NEON/FFT.cpp
+++ b/tests/validation/NEON/FFT.cpp
@@ -158,6 +158,8 @@ TEST_SUITE(FFTConvolutionLayer)
 
 template <typename T>
 using NEFFTConvolutionLayerFixture = FFTConvolutionValidationFixture<Tensor, Accessor, NEFFTConvolutionLayer, T>;
+template <typename T>
+using NEFFTConvolutionLayerMixedDataLayoutFixture = FFTConvolutionValidationFixture<Tensor, Accessor, NEFFTConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -169,10 +171,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFFTConvolutionLayerFixture<float>, framework:
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFFTConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFFTConvolutionLayerDataset(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                 ActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // FFTConvolutionLayer
-
 TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/FillBorder.cpp b/tests/validation/NEON/FillBorder.cpp
index 343ad831e4..928990b2b4 100644
--- a/tests/validation/NEON/FillBorder.cpp
+++ b/tests/validation/NEON/FillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,10 +60,10 @@ DATA_TEST_CASE(FillBorder, framework::DatasetMode::ALL, combine(combine(combine(
 {
     BorderSize border_size{ static_cast<unsigned int>(size) };
 
-    std::mt19937                           generator(library->seed());
-    std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-    const uint8_t                          border_value = distribution_u8(generator);
-    const uint8_t                          tensor_value = distribution_u8(generator);
+    std::mt19937                            generator(library->seed());
+    std::uniform_int_distribution<uint32_t> distribution_u8(0, 255);
+    const uint8_t                           border_value = distribution_u8(generator);
+    const uint8_t                           tensor_value = distribution_u8(generator);
 
     // Create tensors
     Tensor src = create_tensor<Tensor>(shape, data_type);
@@ -77,7 +77,7 @@ DATA_TEST_CASE(FillBorder, framework::DatasetMode::ALL, combine(combine(combine(
     validate(src.info()->padding(), padding);
 
     // Fill tensor with constant value
-    std::uniform_int_distribution<uint8_t> distribution{ tensor_value, tensor_value };
+    std::uniform_int_distribution<uint32_t> distribution{ tensor_value, tensor_value };
     library->fill(Accessor(src), distribution, 0);
 
     // Create and configure kernel
diff --git a/tests/validation/NEON/Floor.cpp b/tests/validation/NEON/Floor.cpp
index 419ce56e44..3cd1033ef9 100644
--- a/tests/validation/NEON/Floor.cpp
+++ b/tests/validation/NEON/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEFloor.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -62,6 +65,30 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
     const Status status = NEFloor::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false));
     ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
 }
+
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuFloorKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_floor";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index d8c2203802..ee7e56227d 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,8 @@
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuFullyConnected.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/FullyConnectedLayerDataset.h"
@@ -40,6 +42,7 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -56,7 +59,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -64,18 +67,25 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true }));
+const auto FullyConnectedParameters = combine(make("TransposeWeights", { false, true }), make("ReshapeWeights", { false, true }));
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(1.f / 256.f, 10),
     QuantizationInfo(1.1f, 10),
 });
-const auto EmptyActivationFunctionDataset = framework::dataset::make("ActivationInfo",
+
+const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo",
+{
+    QuantizationInfo(),
+});
+
+const auto NoActivationFunctionDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
 });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
@@ -83,7 +93,7 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
 });
 
-const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
@@ -94,40 +104,183 @@ const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("Activ
 TEST_SUITE(NEON)
 TEST_SUITE(FullyConnectedLayer)
 
+/** Test case for memory injection in @ref cpu::CpuFullyConnected.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<cpu::CpuFullyConnected>();
+    const auto src_info    = TensorInfo(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(8U, 4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    fc->configure(&src_info, &weight_info, &bias_info, &dst_info, fc_info);
+
+    // src, weight and bias are created once here; only dst is newly created on every call of the lambda below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(fc->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        fc->prepare(prep_pack);
+        fc->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEFullyConnectedLayer.
+ *
+ * Make sure @ref NEFullyConnectedLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<NEFullyConnectedLayer>();
+    const auto src_info    = TensorInfo(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(8U, 4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    auto       run_conv    = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        fc->configure(&src, &weight, &bias, &dst, fc_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        fc->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Unit test for @ref cpu::CpuFullyConnected with quantized multiplier > 1
+ *
+ * Tests output correctness.
+ */
+TEST_CASE(Quant8_Signed_Mult_gt_1, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<cpu::CpuFullyConnected>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 3U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5f, -1));
+    const auto weight_info = TensorInfo(TensorShape(1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5, -8));
+    const auto bias_info   = TensorInfo(TensorShape(1U), 1, DataType::S32);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 3U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.1f, 0));
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    fc->configure(&src_info, &weight_info, &bias_info, &dst_info, fc_info);
+
+    // Tensors are created and allocated once for the single run below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    auto dst    = create_tensor<Tensor>(dst_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias }, { TensorType::ACL_DST, &dst } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(fc->workspace(), mg, run_pack, prep_pack);
+
+    // Initialize input values
+    const std::vector<int8_t>  src_values    = { 3, 63, 31 };
+    const std::vector<int8_t>  weight_values = { -4 };
+    const std::vector<int32_t> bias_values   = { 16 };
+    const std::vector<int32_t> expected      = { 80, 127, 127 };
+    library->fill_static_values(Accessor(src), src_values);
+    library->fill_static_values(Accessor(weight), weight_values);
+    library->fill_static_values(Accessor(bias), bias_values);
+
+    // Run FC layer
+    fc->prepare(prep_pack);
+    fc->run(run_pack);
+
+    auto dst_ptr = reinterpret_cast<int8_t *>(dst.buffer());
+    for(size_t i = 0; i < dst.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(dst_ptr[i] == expected[i], framework::LogLevel::ERRORS);
+    }
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
+    make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Invalid weights dimensions
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Wrongly reshaped weights
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                           }),
-    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
+    make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
+    make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+    make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                            })),
-    framework::dataset::make("TransposeWeights",{ true, true, false, true, true, true })),
-    framework::dataset::make("ReshapedWeights",{ false, false, false, false, false , false})),
-    framework::dataset::make("Expected", { false, true, true, false, false, true })),
+    make("TransposeWeights",{ true, true, false, true, true, true })),
+    make("ReshapedWeights",{ false, false, false, false, false , false})),
+    make("Expected", { false, true, true, false, false, true })),
     input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected)
 {
     // Create Fully Connected layer info
@@ -143,130 +296,251 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
 
 template <typename T>
 using NEFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerMixedDataLayoutFixture = FullyConnectedLayerValidationFixture<Tensor, Accessor, NEFullyConnectedLayer, T, true>;
+template <typename T>
+using NEFullyConnectedLayerDynamicWeightsFixture = FullyConnectedWithDynamicWeightsFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerDynamicBiasFixture = FullyConnectedWithDynamicBiasFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
-                                                                                                                        FullyConnectedParameters),
-                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                                                                                                                        FullyConnectedParameters,
+                                                                                                                        make("DataType", DataType::F16),
+                                                                                                                NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::F16)),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::F16),
                        ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(),
-                                                                                                                      FullyConnectedParameters),
-                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
-                                                                                                              EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(),
+                                                                                                                      FullyConnectedParameters,
+                                                                                                                      make("DataType", DataType::F16),
+                                                                                                              NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F16),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false, true })))
+{
+}
 TEST_SUITE_END()
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                 EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                 make("DataType", DataType::F32),
+                                                                                                                 NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(
-                           combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(
+                           make("Input", TensorShape(9U, 5U, 7U)),
+                           make("Weights", TensorShape(315U, 271U)),
+                       make("Biases", TensorShape(271U)),
+                       make("Output", TensorShape(271U)),
+                       FullyConnectedParameters,
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::F32),
                        ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
-                                                                                                               EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                       make("DataType", DataType::F32),
+                                                                                                               NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false, true })))
+{
+}
 TEST_SUITE_END()
 TEST_SUITE_END()
 
 template <typename T>
 using NEFullyConnectedLayerQuantizedFixture = FullyConnectedLayerValidationQuantizedFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerQuantizedMixedDataLayoutFixture = FullyConnectedLayerValidationQuantizedFixture<Tensor, Accessor, NEFullyConnectedLayer, T, true>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       QuantizationData,
                        ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false })))
+{
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicBiasWithActivation, NEFullyConnectedLayerDynamicBiasFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+}
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(
-                           combine(datasets::LargeFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+// Dynamic Quantization Tests here
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                           combine(datasets::SmallFullyConnectedLayerDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       IgnoredQuantizationData,
+                       NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END()
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(
+                           datasets::LargeFullyConnectedLayerDataset(),
+                            FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       QuantizationData,
+                       NoActivationFunctionDataset))
+{
+    // Validate the target output against the reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicBias, NEFullyConnectedLayerDynamicBiasFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       NoActivationFunctionDataset))
+{
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       IgnoredQuantizationData,
+                               NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       NoActivationFunctionDataset,
+                       make("WeightsReshaped", { false })))
+{
+}
+TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       QuantizationData),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8_SIGNED),
+                       QuantizationData,
                        ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false })))
+{
+}
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+// Dynamic Quantization tests
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(
+                           datasets::SmallFullyConnectedLayerDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8_SIGNED),
+                       IgnoredQuantizationData,
+                       NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       NoActivationFunctionDataset,
+                       make("WeightsReshaped", { false })))
+{
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // FullyConnectedLayer
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index 2d8c61164b..5f6a402204 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,12 +22,15 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuGemm.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -48,6 +51,8 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace
 {
 constexpr AbsoluteTolerance<float> tolerance_f(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
@@ -57,7 +62,7 @@ const AbsoluteTolerance<float>      abs_tolerance_f16(0.2f);      /**< Absolute
 constexpr float                     tolerance_num = 0.07f;        /**< Tolerance number for FP16 data types */
 #endif                                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -65,62 +70,210 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
-const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
+const auto data_interleave = make("M", 8, 12) * make("N", 8, 12);
+const auto data_transpose  = make("M", 8, 14) * make("N", 7, 14);
 
 /** Zero padding test */
 template <typename FunctionType>
 bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value)
 {
     const TensorShape in_shape(dim0_value, dim1_value);
+    TensorInfo        in(in_shape, 1, DataType::U32);
+    TensorInfo        dst;
 
-    // Create tensors
-    Tensor in = create_tensor<Tensor>(in_shape, DataType::U32);
-    Tensor dst;
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(in.is_resizable(), framework::LogLevel::ERRORS);
 
     // Validate zero-padding
     FunctionType func;
 
     func.configure(&in, &dst);
 
-    return in.info()->padding().empty();
+    return in.padding().empty();
 }
 
 /* Zero padding test for GEMM kernels */
 bool validate_gemm_zero_padding(const TensorShape shape0, const TensorShape shape1)
 {
     // Create tensors
-    Tensor in0 = create_tensor<Tensor>(shape0, DataType::F32);
-    Tensor in1 = create_tensor<Tensor>(shape1, DataType::F32);
-    Tensor dst;
+    TensorInfo in0(shape0, 1, DataType::F32);
+    TensorInfo in1(shape1, 1, DataType::F32);
+    TensorInfo dst;
 
     // Validate zero-padding
-    NEGEMMMatrixMultiplyKernel gemm;
+    cpu::kernels::CpuGemmMatrixMultiplyKernel gemm;
     gemm.configure(&in0, &in1, &dst, 1.0, false);
 
-    return in0.info()->padding().empty() && in1.info()->padding().empty() && dst.info()->padding().empty();
+    return in0.padding().empty() && in1.padding().empty() && dst.padding().empty();
 }
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(GEMM)
 
+/** Test case for memory injection in @ref cpu::CpuGemm.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       gemm      = std::make_unique<cpu::CpuGemm>();
+    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
+    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&lhs_info, &rhs_info, &c_info, &dst_info, 1.f, 1.f, gemm_info); // configured once, from TensorInfo descriptors only
+
+    // lhs, rhs and c are created and allocated once here; only dst is re-created on every call of the lambda below
+    auto lhs = create_tensor<Tensor>(lhs_info);
+    auto rhs = create_tensor<Tensor>(rhs_info);
+    auto c   = create_tensor<Tensor>(c_info);
+    lhs.allocator()->allocate();
+    rhs.allocator()->allocate();
+    c.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst); // point the run pack at this call's fresh output tensor
+
+        library->fill_tensor_value(Accessor(lhs), 1.f);
+        library->fill_tensor_value(Accessor(rhs), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // both runs were fed identical inputs; compare outputs element-wise
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMM.
+ *
+ * Make sure @ref NEGEMM still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       gemm      = std::make_unique<NEGEMM>(); // a single NEGEMM instance is reused by both runs below
+    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
+    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto gemm_info = GEMMInfo{};
+    auto       run_conv  = [&]()
+    {
+        auto lhs = create_tensor<Tensor>(lhs_info); // all four tensors are created anew on every call
+        auto rhs = create_tensor<Tensor>(rhs_info);
+        auto c   = create_tensor<Tensor>(c_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+        gemm->configure(&lhs, &rhs, &c, &dst, 1.f, 1.f, gemm_info); // re-configured on each call, directly on the tensors
+        lhs.allocator()->allocate();
+        rhs.allocator()->allocate();
+        c.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(lhs), 1.f);
+        library->fill_tensor_value(Accessor(rhs), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+        gemm->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // both runs were fed identical inputs; compare outputs element-wise
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
+                                                     }),
+               make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
+                                                     })),
+               make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
+                                                     })),
+               make("Expected", { false, true })),
+               lhs_info, rhs_info, output_info, expected)
+{
+    constexpr float alpha = 1.0;
+    constexpr float beta = 0.0;
+    const auto gemm_info = GEMMInfo();
+    bool is_valid = bool(NEGEMM::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), alpha, beta, gemm_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(KERNEL_SELECTION)
+DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
+               combine(make("CpuExt", std::string("NEON")),
+                       make("DataType", { DataType::F32,
+                                                              DataType::F16
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl_mul = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_mul);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_mul";
+    std::string actual   = selected_impl_mul->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+
+    const auto *selected_impl_add = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_add);
+
+    expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_add";
+    actual   = selected_impl_add->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KERNEL_SELECTION
+
 TEST_SUITE(TRANSPOSE_1XW)
-using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMTranspose1xWKernel, 4>;
+using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>;
 DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
-                   framework::dataset::make("N", { 1, 23, 63, 101 }),
-                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+                   make("N", { 1, 23, 63, 101 }),
+                   make("K", { 1, 47, 29, 27 })),
                n_value, k_value)
 {
-    bool status = validate_zero_padding<NEGEMMTranspose1xWKernel>(n_value, k_value);
+    bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value);
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
 TEST_SUITE(U32)
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint32_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32))
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -128,8 +281,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMo
 TEST_SUITE_END() // U32
 
 TEST_SUITE(U16)
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint16_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16))
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U16))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -137,8 +290,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMo
 TEST_SUITE_END() // U16
 
 TEST_SUITE(U8)
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint8_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8))
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U8))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -148,20 +301,20 @@ TEST_SUITE_END() // U8
 TEST_SUITE_END() // TRANSPOSE_1XW
 
 TEST_SUITE(INTERLEAVE_4X4)
-using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMInterleave4x4Kernel, 4>;
+using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>;
 
 DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
-                   framework::dataset::make("M", { 1, 23, 63, 101 }),
-                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+                   make("M", { 1, 23, 63, 101 }),
+                   make("K", { 1, 47, 29, 27 })),
                m_value, k_value)
 {
-    bool status = validate_zero_padding<NEGEMMInterleave4x4Kernel>(m_value, k_value);
+    bool status = validate_zero_padding<cpu::kernels::CpuGemmInterleave4x4Kernel>(m_value, k_value);
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
 TEST_SUITE(U32)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint32_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32))
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -169,8 +322,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetM
 TEST_SUITE_END() // U32
 
 TEST_SUITE(U16)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint16_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16))
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U16))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -178,8 +331,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetM
 TEST_SUITE_END() // U16
 
 TEST_SUITE(U8)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint8_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8))
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::QASYMM8))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -188,23 +341,24 @@ TEST_SUITE_END() // U8
 
 TEST_SUITE_END() // INTERLEAVE_4X4
 
-//TODO(COMPMID-415): Validate valid region
-
 template <typename T>
 using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;
 
 template <typename T>
-using NEGEMMFixtureDisabledC = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true>;
+using NEBatchedMatMulFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true, false, false, false, false, true>;
+
+template <typename T>
+using NEGEMMAccumulateFixture = GEMMAccumulateValidationFixture<Tensor, Accessor, NEGEMM, T>;
 
 TEST_SUITE(Float)
-DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::dataset::make("In0", { TensorShape(21U, 13U),
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(make("In0", { TensorShape(21U, 13U),
                                                                                                        TensorShape(31U, 1U),
                                                                                                        TensorShape(31U, 1U),
                                                                                                        TensorShape(8U, 2U),
                                                                                                        TensorShape(38U, 12U),
                                                                                                        TensorShape(32U, 1U)
                                                                                                      }),
-                                                                     framework::dataset::make("In1", { TensorShape(33U, 21U),
+                                                                     make("In1", { TensorShape(33U, 21U),
                                                                                                        TensorShape(23U, 31U),
                                                                                                        TensorShape(23U, 31U),
                                                                                                        TensorShape(16U, 8U),
@@ -217,59 +371,111 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                     zip(make("In0",{ TensorShape(21U, 13U) }),
+                                                                     make("In1", { TensorShape(33U, 21U) }),
+                                                                     make("Dst", { TensorShape(33U, 13U) })),
+                                                                     zip(
+                                                                     make("alpha", { 1.0, 100.0, 1.0, 1.0 }),
+                                                                     make("beta", { 0.0, 0.0, 1.0, 1.0 }),
+                                                                     make("is_c_null", { false, false, false, true }),
+                                                                     make("Expected", { true, false, false, true }))),
+               shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected)
+{
+    /* Checks that cpu::CpuGemm::validate() matches "Expected" for each alpha/beta/is_c_null row when accumulation is requested */
+    // Describe the F32 operands with TensorInfo objects only
+    TensorInfo in_a(shape_a, 1, DataType::F32);
+    TensorInfo in_b(shape_b, 1, DataType::F32);
+    TensorInfo in_c(shape_dst, 1, DataType::F32);
+    TensorInfo dst(shape_dst, 1, DataType::F32);
+
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    // Run validate() with in_c replaced by nullptr when is_c_null is set, then compare the status against "Expected"
+    cpu::CpuGemm gemm;
+    Status status = gemm.validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, alpha, beta, gemm_info);
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                         framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                 framework::dataset::make("DataType", DataType::F16)))
+                                                                                                         make("ReshapeWeights", { true, false })),
+                                                                                                 make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
-                                                                                                       framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                       make("ReshapeWeights", { true, false })),
+                                                                                               make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+}
 
-                                                                                               framework::dataset::make("DataType", DataType::F16)))
+TEST_SUITE(BATCHED_MATMUL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                  make("ReshapeWeights", { false })),
+                                                                                                          make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // BATCHED_MATMUL
+
+TEST_SUITE_END() // FP16
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                          framework::dataset::make("ReshapeWeights", { true, false })),
-
-                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+                                                                                                          make("ReshapeWeights", { true, false })),
+                                                                                                  make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
-                                                                                                        framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                        make("ReshapeWeights", { true, false })),
+                                                                                                make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f);
+}
 
-                                                                                                framework::dataset::make("DataType", DataType::F32)))
+TEST_SUITE(BATCHED_MATMUL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                   make("ReshapeWeights", { false })),
+                                                                                                           make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE(DisabledC)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixtureDisabledC<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true, false })),
+TEST_SUITE_END() // BATCHED_MATMUL
 
-                                                                                                           framework::dataset::make("DataType", DataType::F32)))
+TEST_SUITE(ACCUMULATE)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAccumulateFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(),
+                                                                                                        make("ReshapeWeights", { false }),
+                                                                                                        make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMAccumulateFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(),
+                                                                                                        make("ReshapeWeights", { false }),
+                                                                                                        make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // ACCUMULATE
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // GEMM
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 518f4804a0..d25f43a330 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -37,7 +39,6 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GEMMLowpAssemblyFixture.h"
 #include "tests/validation/fixtures/GEMMLowpFixture.h"
 
 namespace arm_compute
@@ -46,10 +47,26 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
+namespace
+{
+    constexpr AbsoluteTolerance<float> tolerance_batched(1);
+    constexpr AbsoluteTolerance<float> tolerance_quant(1);
+} // namespace
+
+
 TEST_SUITE(NEON)
 TEST_SUITE(GEMMLowp)
 TEST_SUITE(MatrixMultiplyCore)
+
 using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpMatrixMultiplyCoreAccumulateFixture = GEMMLowpMatrixMultiplyAccumulateValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpBatchedMatMulFixture      = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, true>;
+using NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture = GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+
+using framework::dataset::make;
 
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
                shape_a, shape_b, shape_c, a_offset, b_offset)
@@ -75,29 +92,69 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c
     validate(b.info()->padding(), PaddingSize());
     validate(c.info()->padding(), PaddingSize());
 }
+// Accumulation is not supported for Int8/UInt8 on aarch32, so this validate() check is only built for aarch64
+#ifdef __aarch64__
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("In0",{ TensorShape(21U, 1U) }),
+                                                                     make("In1", { TensorShape(1U, 21U) }),
+                                                                     make("Dst", { TensorShape(1U, 1U) }),
+                                                                     make("a_offset", { -2 }),
+                                                                     make("b_offset", { 13 })
+                                                                    ),
+                                                                    zip(
+                                                                     make("OutputDataType", {  DataType::S32,  DataType::QASYMM8, DataType::QASYMM8_SIGNED}),
+                                                                     make("Expected", { true, false, false })
+                                                                    )),
+               shape_a, shape_b, shape_dst, a_offset, b_offset, output_data_type, expected)
+{
+    DataType input_data_type = (output_data_type == DataType::S32 ? DataType::QASYMM8 : output_data_type);
+    // Operand metadata: for an S32 output the inputs are QASYMM8, otherwise they share the output data type
+    TensorInfo a(shape_a, 1, input_data_type, QuantizationInfo(1.0f / 255, a_offset));
+    TensorInfo b(shape_b, 1, input_data_type, QuantizationInfo(1.0f / 255, b_offset));
+    TensorInfo dst(shape_dst, 1, output_data_type, QuantizationInfo());
+
+    // Request accumulation; a quantized output additionally gets a fixed-point quantize-down output stage
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    if (is_data_type_quantized(output_data_type))
+    {
+        GEMMLowpOutputStageInfo gemmLowpOutputStageInfo = GEMMLowpOutputStageInfo();
+        gemmLowpOutputStageInfo.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+
+        gemm_info.set_gemmlowp_output_stage(gemmLowpOutputStageInfo);
+    }
+
+    cpu::CpuGemmLowpMatrixMultiplyCore gemmlowp_mm;
+    Status status = gemmlowp_mm.validate(&a, &b, nullptr, &dst, gemm_info);
+
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+#endif // __aarch64__
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
                                              TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),                                 // Mismatching data type
                                              TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
                                              TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
                                              TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)),
                                           }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+    make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                          }),
+    make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
                                             TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Expected", { true, false, false, false, true })),
+                                           }),
+    make("Expected", { true, false, false, false, true })),
     a_info, b_info, output_info, expected)
 {
     // Lock tensors
@@ -110,444 +167,224 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+/** Test case for memory injection in @ref cpu::CpuGemmLowpMatrixMultiplyCore.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same S32 output, compared element by element
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    auto gemm     = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);
+
+    // a, b and this dst are created once here; the run_conv lambda below builds a fresh dst on every call
+    auto a   = create_tensor<Tensor>(a_info);
+    auto b   = create_tensor<Tensor>(b_info);
+    auto dst = create_tensor<Tensor>(dst_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack =
+    {
+        { TensorType::ACL_SRC_0, &a },
+        { TensorType::ACL_SRC_1, &b },
+        { TensorType::ACL_DST, &dst }
+    };
+    ITensorPack prep_pack =
+    {
+        { TensorType::ACL_SRC_1, &b },
+    };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const int32_t *>(result_0.buffer())[i] == reinterpret_cast<const int32_t *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+/** Test case for memory injection in @ref NEGEMMLowpMatrixMultiplyCore.
+ *
+ * Make sure @ref NEGEMMLowpMatrixMultiplyCore still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same S32 output, compared element by element
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    auto gemm     = std::make_unique<NEGEMMLowpMatrixMultiplyCore>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    auto       run_conv  = [&]()
+    {
+        auto a   = create_tensor<Tensor>(a_info);
+        auto b   = create_tensor<Tensor>(b_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+        gemm->configure(&a, &b, nullptr, &dst, gemm_info);
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        gemm->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const int32_t *>(result_0.buffer())[i] == reinterpret_cast<const int32_t *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
 }
 
-using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
-TEST_SUITE(FusedOffsetOutput)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FusedOffsetOutput
-TEST_SUITE_END() // MatrixMultiplyCore
-
-TEST_SUITE(OutputStage)
-
-TEST_SUITE(QuantizeDownInt32Scale)
 
+TEST_SUITE(BatchedMatMul)
 TEST_SUITE(QASYMM8)
-
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                      3)
-                                                      * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                           2)
-                                                           * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpOutputStage>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        0,
-                                            13,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type        = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound        = min;
-    output_stage.gemmlowp_max_bound        = max;
-    output_stage.output_data_type = DataType::QASYMM8;
-
-    // Lock tensors
-    Status status =  NEGEMMLowpOutputStage::validate(&a_info.clone()->set_is_resizable(false),
-                                                                     &b_info.clone()->set_is_resizable(false),
-                                                                     &output_info.clone()->set_is_resizable(false),
-                                                                     output_stage);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
-    Tensor input1 = create_tensor<Tensor>(TensorShape(21U, 13U), DataType::S32);
-    Tensor input2 = create_tensor<Tensor>(TensorShape(21U, 1U), DataType::S32);
-    Tensor output = create_tensor<Tensor>(TensorShape(21U, 13U), DataType::QASYMM8);
-
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type                    = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound      = 0;
-    output_stage.gemmlowp_max_bound      = 205;
-    output_stage.output_data_type        = DataType::QASYMM8;
-
-    NEGEMMLowpOutputStage f;
-    f.configure(&input1, &input2, &output, output_stage);
-
-    // Validate padding is zero
-    validate(input1.info()->padding(), PaddingSize());
-    validate(input2.info()->padding(), PaddingSize());
-    validate(output.info()->padding(), PaddingSize());
+    validate(Accessor(_target), _reference, tolerance_batched);
 }
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END() // BoundedReLu
-
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-
-const auto quantize_down_int32_to_int8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                     3)
-                                                     * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                          2)
-                                                          * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", -100, -98) * framework::dataset::make("max", 71, 74) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpOutputStage>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Invalid min and max
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                            TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        -10,
-                                            -200,
-                                            -113,
-                                           })),
-    framework::dataset::make("Max",{        105,
-                                            300,
-                                            -18,
-                                           })),
-    framework::dataset::make("Expected", { true, false, false })),
-    a_info, b_info, output_info, min, max, expected)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8_SIGNED }),
+        make("reshape_b_only_on_first_run", { false })))
 {
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type        = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound        = min;
-    output_stage.gemmlowp_max_bound        = max;
-    output_stage.output_data_type = DataType::QASYMM8_SIGNED;
-
-    // Lock tensors
-    Status status =  NEGEMMLowpOutputStage::validate(&a_info.clone()->set_is_resizable(false),
-                                                                     &b_info.clone()->set_is_resizable(false),
-                                                                     &output_info.clone()->set_is_resizable(false),
-                                                                     output_stage);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    validate(Accessor(_target), _reference, tolerance_batched);
 }
-// clang-format on
-// *INDENT-ON*
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // BatchedMatMul
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases))
+TEST_SUITE(FusedOffsetOutput)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
+TEST_SUITE_END() // FusedOffsetOutput
 
-TEST_SUITE_END() // BoundedReLu
-
-TEST_SUITE_END() // QASYMM8_SIGNED
-
-TEST_SUITE_END() // QuantizeDownInt32Scale
-
-TEST_SUITE(QuantizeDownInt32ToUint8ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>;
-
-using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        0,
-                                            13,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                 &b_info.clone()->set_is_resizable(false),
-                                                                                 &output_info.clone()->set_is_resizable(false),
-                                                                                 min,
-                                                                                 max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
+// accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __aarch64__
+TEST_SUITE(ACCUMULATION)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
+TEST_SUITE_END() // S32
+TEST_SUITE_END() // ACCUMULATION
+#endif // __aarch64__
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
+TEST_SUITE(DynamicQuantization)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // BoundedReLu
-
-TEST_SUITE_END() // QuantizeDownInt32ToUint8ScaleByFixedPoint
-
-TEST_SUITE(QuantizeDownInt32ToInt8ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                   2)
-                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 128) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                        2)
-                                                                        * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-        framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::F32), // Invalid input data type
-                                                 TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                                 TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),
-        }),
-        framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(21U), 1, DataType::S32),
-        })),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                                TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-        })),
-        framework::dataset::make("Min",{ -110,
-                                         -113,
-                                         -113,
-        })),
-        framework::dataset::make("Max",{ 87,
-                                         97,
-                                         97,
-        })),
-        framework::dataset::make("Expected", { false, false, true })),
-               a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                  &b_info.clone()->set_is_resizable(false),
-                                                                                  &output_info.clone()->set_is_resizable(false),
-                                                                                  min,
-                                                                                  max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_cases))
+TEST_SUITE_END() // DynamicQuantization
+
+#ifdef __aarch64__
+// Dequant tests involve returning F32 from the MatrixMultiplyCore kernels and are only implemented in aarch64
+TEST_SUITE(Dequant)
+constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f);
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL,
+    combine(
+        datasets::SmallGEMMLowpDataset(),
+        make("accumulate", {true, false})
+    ))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_dequantized);
 }
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::LargeGEMMLowpDataset(),
+        make("accumulate", {false})
+    ))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt8ScaleByFixedPoint
-
-TEST_SUITE(QuantizeDownInt32ToInt16ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
-                                                                                                        1073741825)
-                                                                               * framework::dataset::make("result_shift", -3,
-                                                                                                          -2)
-                                                                               * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
-                                                                                                             254601602)
-                                                                                    * framework::dataset::make("result_shift", -3,
-                                                                                                               -1)
-                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QSYMM16),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        -205,
-                                            -180,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                 &b_info.clone()->set_is_resizable(false),
-                                                                                 &output_info.clone()->set_is_resizable(false),
-                                                                                 min,
-                                                                                 max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    validate(Accessor(_target), _reference, tolerance_dequantized);
 }
-// clang-format on
-// *INDENT-ON*
+TEST_SUITE_END() // Dequant
+#endif // __aarch64__
 
-TEST_SUITE(NoRelu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // NoRelu
-TEST_SUITE(BoundedReLu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt16ScaleByFixedPoint
-TEST_SUITE_END() // OutputStage
+TEST_SUITE_END() // MatrixMultiplyCore
 TEST_SUITE_END() // GEMMLowp
-TEST_SUITE_END() // Neon
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Gather.cpp b/tests/validation/NEON/Gather.cpp
index ca1e166bd1..0aea19939e 100644
--- a/tests/validation/NEON/Gather.cpp
+++ b/tests/validation/NEON/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,12 +100,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 template <typename T>
 using NEGatherFixture = GatherFixture<Tensor, Accessor, NEGather, T>;
 
+const auto gather_small_shapes = arm_compute::test::framework::dataset::concat(datasets::SmallGatherDataset(), datasets::SmallGatherMultiDimIndicesDataset());
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::F16)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -125,7 +127,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::F32)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -146,7 +148,7 @@ TEST_SUITE(U8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::U8)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::U8)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -166,7 +168,7 @@ TEST_SUITE(U16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<uint16_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::U16)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::U16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index 156957a601..ef5e75c5db 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/cpu/kernels/CpuIm2ColKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -57,7 +57,7 @@ const auto conv_args_small         = combine(combine(combine(combine(conv_filter
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
 
-using NEIm2Col = NESynthetizeFunction<NEIm2ColKernel>;
+using CpuIm2Col = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuIm2ColKernel>;
 
 // *INDENT-OFF*
 // clang-format off
@@ -78,26 +78,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
+    bool status = bool(cpu::kernels::CpuIm2ColKernel::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
 
 template <typename T>
-using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T, false>;
+using CpuIm2ColFixture = Im2ColOpValidationFixture<Tensor, Accessor, CpuIm2Col, T, false>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F32)),
-                                                                                                    conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                     conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)),
-                                                                                                  conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                           DataType::F32)),
+                                                                                                   conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -107,15 +107,15 @@ TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F16)),
-                                                                                                   conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                    conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                         DataType::F16)),
-                                                                                                 conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                          DataType::F16)),
+                                                                                                  conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -127,15 +127,15 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                      conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                       conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()),
-                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                    conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()),
+                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                     conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -165,8 +165,8 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     Tensor dst_target = create_tensor<Tensor>(dst_shape, data_type, 1, qinfo);
 
     // Configure target function
-    NEIm2Col im2col_func;
-    im2col_func.configure(&src_target, &dst_target, spatial_kernel, conv_info, has_bias);
+    CpuIm2Col im2col_func;
+    im2col_func.configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias);
 
     // Extend padding
     src_target.info()->extend_padding(PaddingSize(3, 5, 9, 1));
@@ -185,8 +185,13 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     // Fill target source
     library->fill_tensor_uniform(Accessor(src_target), 0);
 
+    ITensorPack pack =
+    {
+        { TensorType::ACL_SRC, &src_target },
+        { TensorType::ACL_DST, &dst_target }
+    };
     // Run target function
-    im2col_func.run();
+    im2col_func.run(pack);
 
     // Calculate Reference
     SimpleTensor<float> src_ref{ src_shape, data_type, 1, qinfo, data_layout };
diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp
index d391267e3e..6b98ee2b67 100644
--- a/tests/validation/NEON/LSTMLayerQuantized.cpp
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,11 +64,7 @@ inline void fill_tensor(SimpleTensor<T> &tensor, const std::vector<T> &v)
 }
 
 /** Tolerance for quantized asymmetric operations */
-#if defined(__aarch64__)
-constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(0);
-#else  // defined(__aarch64__)
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
-#endif // defined(__aarch64__)
 
 } // namespace
 
diff --git a/tests/validation/NEON/MatMul.cpp b/tests/validation/NEON/MatMul.cpp
new file mode 100644
index 0000000000..f22bd9e86a
--- /dev/null
+++ b/tests/validation/NEON/MatMul.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/validation/fixtures/MatMulFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(MatMul)
+
+constexpr AbsoluteTolerance<float> tolerance_fp32(
+    0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
+const AbsoluteTolerance<half> tolerance_fp16(half(0.1f)); /**< Tolerance value for comparing reference's output against implementation's output for FP16 data types */
+#ifdef __aarch64__ // Only used by the Quantized suites, which are guarded by the same macro
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED data types */
+#endif // __aarch64__
+
+// clang-format off
+// *INDENT-OFF*
+// Validation Tests
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, // Each zipped row i is one (A, B, output, const flag, expected) scenario
+    zip(
+        make("InputAInfo", {
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Mismatching data types between A (F32) and B (QASYMM8)
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::S32),        // Unsupported data type (S32)
+            TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32),    // Broadcasting in batch dimension not supported
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Invalid shape for multiplication: A's dim 0 (9) != B's dim 1 (12)
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Valid F32 case
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Valid F32 case with a third (batch) dimension
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are flagged as constant, i.e. not dynamic
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8), // Valid QASYMM8 case. NOTE(review): the Quantized run suites in this file are __aarch64__-only; confirm this expectation holds on other targets
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Valid QASYMM8_SIGNED case
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching output data type (output is QASYMM8)
+        }),
+        make("InputBInfo", {
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+        }),
+        make("OutputInfo", {
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+        }),
+        make("TensorIsConst", {false, false, false, false, false , false, true, false, false, false}), // Applied to both A and B in the test body
+        make("Expected", { false, false, false, false, true, true, false, true, true, false })), // Expected result of bool(NEMatMul::validate(...)) per row
+    a_info, b_info, output_info, are_tensors_const, expected)
+{
+    TensorInfo a{a_info}; // Work on local copies so the constness flag can be set per row
+    TensorInfo b{b_info};
+    a.set_are_values_constant(are_tensors_const);
+    b.set_are_values_constant(are_tensors_const);
+    Status status =  NEMatMul::validate(&a,
+                                        &b,
+                                        &output_info,
+                                        MatMulInfo(), // Default MatMul info and CPU settings
+                                        CpuMatMulSettings());
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); // Status converted to bool is compared against the row's expectation
+}
+// *INDENT-ON*
+// clang-format on
+
+// Generic Template
+template <typename T>
+using NEMatMulFixture = MatMulValidationWithActivationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+// Fast math Template
+template <typename T>
+using NEMatMulFastMathFixture = MatMulGenericValidationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T>
+using NEMatMulFixedFormatFixture = MatMulFixedFormatFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T>
+using NEMatMulDynamicTensorsFixture =
+    MatMulValidationWithDynamicTensorsFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T>
+using NEQuantizedMatMulFixture = QuantizedMatMulValidationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+FIXTURE_DATA_TEST_CASE(RunHighDimensions,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::HighDimensionalMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors,
+                       NEMatMulDynamicTensorsFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfRuns", 5)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+/* Note : MatMul BF16 is enabled by specifying FP32 datatype and enabling the fast math setting */
+constexpr AbsoluteTolerance<float> tolerance_bf16(0.02f);
+TEST_SUITE(BF16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFastMathFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_bf16);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+FIXTURE_DATA_TEST_CASE(RunTinyFixedFormat, // BFLOAT16 MatMul with both fast math and fixed format enabled
+                       NEMatMulFixedFormatFixture<bfloat16>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::TinyMatMulDataset(),
+                               make("TransposeA", {false}), // Only the non-transposed configuration is exercised here
+                               make("TransposeB", {false}),
+                               make("DataType", DataType::BFLOAT16),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true).fixed_format(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    if (CPUInfo::get().has_bf16()) // The output is only checked when the CPU reports BF16 support; otherwise this body performs no validation
+    {
+        // Validate output
+        validate(Accessor(_target), _reference, tolerance_bf16);
+    }
+}
+#endif /* ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS */
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFastMathFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_bf16, 0.01 /* tolerance_num */);
+}
+TEST_SUITE_END() // BF16
+#endif           /* ARM_COMPUTE_ENABLE_BF16 */
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors,
+                       NEMatMulDynamicTensorsFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfRuns", 5)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+#ifdef __aarch64__ // All the GeMM CPU assembly kernels for integer datatypes require aarch64
+TEST_SUITE(Quantized)
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 30, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::SmallerMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 30, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 100, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 200, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 40, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::SmallerMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 40, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 150, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 250, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+#endif           // __aarch64__
+
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/MaxUnpoolingLayer.cpp b/tests/validation/NEON/MaxUnpoolingLayer.cpp
index 27f131fa51..0eb021fe71 100644
--- a/tests/validation/NEON/MaxUnpoolingLayer.cpp
+++ b/tests/validation/NEON/MaxUnpoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,10 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
@@ -33,7 +35,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/MaxUnpoolingLayerFixture.h"
-
 namespace arm_compute
 {
 namespace test
@@ -51,7 +52,7 @@ const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(framework::datase
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
                                                                                                                    framework::dataset::make("DataType", DataType::F32))),
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
 
@@ -63,7 +64,7 @@ FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framewor
 TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType", DataType::F16))),
                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
 
@@ -74,7 +75,37 @@ FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 TEST_SUITE_END() // Float
+
+TEST_SUITE(KernelSelection)
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, // Checks the name of the kernel implementation chosen for each data type
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{}; // Value-initialised; only the three flags below are set explicitly
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE"); // Always false with the current dataset, which only supplies "NEON"
+    cpu_isa.fp16 = (data_type == DataType::F16); // FP16 is flagged only for the F16 case
+
+    const auto *selected_impl = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); // Checked before selected_impl->name is read below
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_maxunpooling"; // Name pattern: <lower-case ext>_<cpu_impl_dt(data type)>_maxunpooling
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KernelSelection
 TEST_SUITE_END() // PoolingLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
index 20e3bd5325..085f3608a0 100644
--- a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,8 @@ namespace
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<half> tolerance_f16(half(0.2f));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-RelativeTolerance<float> tolerance_f32(1e-8f);
+RelativeTolerance<float>   tolerance_f32(1e-4f); // Relative tolerance used by the FP32 cases
+RelativeTolerance<uint8_t> tolerance_qasymm8(1); // Used by the QASYMM8 cases. NOTE(review): a relative tolerance of 1 looks very loose for 8-bit data; confirm whether an AbsoluteTolerance was intended
 } // namespace
 
 TEST_SUITE(NEON)
@@ -81,7 +82,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -101,7 +102,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-7 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -114,9 +115,23 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEMeanStdDevNormalizationLayerFixture<float>, f
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InPlace", { false, true })),
+                       framework::dataset::make("Epsilon", { 1e-7 })))
+{
+    // Validate output against the reference using tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // MeanStdNormalizationLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 1bb0588919..964d1c5deb 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,6 +56,11 @@ const auto PixelWiseMultiplicationQASYMM8QuantDataset = combine(combine(
                                                                     framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
                                                                 framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 32768.f, 0) }));
 
+const auto PixelWiseMultiplicationQASYMM8QuantInPlaceDataset = combine(combine(
+                                                                           framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 32768.f, 10) }),
+                                                                           framework::dataset::make("Src1QInfo", { QuantizationInfo(5.f / 32768.f, 10) })),
+                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 10) })); // Src0, Src1 and Out all use the same QuantizationInfo (5/32768, offset 10)
+
 const auto PixelWiseMultiplicationPolicySTNUDataset = combine(
                                                           framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }),
                                                           framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_NEAREST_UP }));
@@ -75,7 +80,8 @@ const auto PixelWiseMultiplicationPolicySTZDataset = combine(
  * expected to have either different quantization information, data type
  * or different shape we are not testing in-place computation.
  */
-const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true }); // "InPlace" takes both values: false and true
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); // "InPlace" is only ever false
 
 #define DEFAULT_VALIDATE validate(Accessor(_target), _reference);
 #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
@@ -275,7 +281,19 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture,
                                                                                                                      framework::dataset::make("Scale", { scale_unity })),
                                                                                                                      PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                      PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                                     InPlaceDataSet))
+                                                                                                                     OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallInPlace, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("Scale", { scale_unity })),
+                       PixelWiseMultiplicationPolicySTZDataset),
+                       PixelWiseMultiplicationQASYMM8QuantInPlaceDataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -292,7 +310,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_255 })),
                                                                                                                        PixelWiseMultiplicationPolicySTNUDataset),
                                                                                                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -306,7 +324,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_unity })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -320,7 +338,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_other })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -335,7 +353,20 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQASYMM8Fixtur
                                                        framework::dataset::make("Scale", { scale_other })),
                                                PixelWiseMultiplicationPolicySTZDataset),
                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                               framework::dataset::make("InPlace", { false })))
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyInPlace, NEPixelWiseMultiplicationBroadcastQASYMM8Fixture, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                       framework::dataset::make("Scale", { scale_other })),
+                                               PixelWiseMultiplicationPolicySTZDataset),
+                                       PixelWiseMultiplicationQASYMM8QuantInPlaceDataset),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -351,7 +382,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_255 })),
                                                                                                                        PixelWiseMultiplicationPolicySTNUDataset),
                                                                                                                        PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -365,7 +396,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_unity })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -379,7 +410,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_other })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -394,7 +425,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, f
                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                                    framework::dataset::make("InPlace", { false })))
+                                                                                                                    OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -411,7 +442,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, fram
                                                                                                                        framework::dataset::make("Scale", { scale_255 })),
                                                                                                                        datasets::ConvertPolicies()),
                                                                                                                        framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
-                                                                                                                       framework::dataset::make("InPlace", { false })))
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<int16_t>(1), 0.f);
@@ -451,17 +482,17 @@ TEST_SUITE_END() // U8toU8
 TEST_SUITE(U8toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, OutOfPlaceDataSet,
                                                  WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet,
                                                  DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, OutOfPlaceDataSet,
                                                  DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp
new file mode 100644
index 0000000000..07054462f5
--- /dev/null
+++ b/tests/validation/NEON/Pooling3dLayer.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/Pooling3dLayerDataset.h"
+#include "tests/datasets/PoolingTypesDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/Pooling3dLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Input data sets for floating-point data types */
+const auto Pooling3dLayerDatasetFP = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                             framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                     framework::dataset::make("Padding", { Padding3D(0, 1, 0), Padding3D(1, 1, 1) })),
+                                             framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 2, 2), Size3D(3, 3, 3) })),
+                                                                  framework::dataset::make("Stride", { Size3D(2, 2, 2), Size3D(2, 1, 1) })),
+                                                          framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                  framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetQASYMM8Small = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                               framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+                                                                       framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                               framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true }));
+
+const auto Pooling3dLayerDatasetQASYMM8Large = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                               framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+                                                                       framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 2, 1) })),
+                                                               framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true }));
+
+using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>;
+
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);     /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+#endif                                                       /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric type */
+
+const auto qasymm8_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, 10) });
+const auto qasymm8_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, 10), // Same qinfo
+    QuantizationInfo(.1f, 5),  // Multiplier <= 1
+    QuantizationInfo(2.f, 3)   // Multiplier > 1
+});
+
+const auto qasymm8_signed_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, -10) });
+const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, -10), // Same qinfo
+    QuantizationInfo(.1f, -5),  // Multiplier <= 1
+    QuantizationInfo(2.f, -3)   // Multiplier > 1
+});
+
+} //namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Pooling3dLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // Checks NEPooling3dLayer::validate() against the Expected flag of each zipped row
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Mismatching data type
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid output shape
+                                            TensorInfo(TensorShape(5U, 13U, 15U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Global Pooling
+                                            TensorInfo(TensorShape(13U,13U, 5U, 1U, 2U),  1, DataType::F32, DataLayout::NDHWC),     // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid data type
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC),      // Invalid data layout
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 5U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 16U,  1U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                          }),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U, 3U, 3U), 1, DataType::F16, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 30U, 11U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 25U, 16U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 3U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 3U), 1, DataType::F32, DataLayout::NDHWC),            // Global pooling applied
+                                            TensorInfo(TensorShape(5U,  2U,  2U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::QASYMM8, DataLayout::NDHWC),        // Invalid data type
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid data layout
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC),             // size larger than height
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                    })),
+    framework::dataset::make("PoolInfo",  { Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(2, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::L2,  3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX, 2, Size3D(1, 1, 2), Padding3D(0, 0, 0), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(2U, 2U, 2U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), true),  // pool size is equal to the padding size
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), false), // pool size is equal to the padding size
+                                            Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(2U, 2U, 2U), Padding3D(2,1,2,2,1,2), false, false, DimensionRoundingType::CEIL), // CEIL with asymmetric Padding; NOTE(review): 15th PoolInfo/Expected entry, but InputInfo and OutputInfo list only 14 - confirm zip() actually runs this case
+                                            })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, false, true , false, true, false, false, false})),
+    input_info, output_info, pool_info, expected)
+{
+    bool is_valid = bool(NEPooling3dLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEPoolingLayer3dFixture = Pooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+template <typename T>
+using NESpecial3dPoolingLayerFixture = SpecialPooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+template <typename T>
+using NEPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecial3dPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::Pooling3dLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
+{
+    // Validate output against the reference within tolerance_f32 (absolute, 0.001)
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5dShapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                            framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output against the reference within tolerance_f32 (absolute, 0.001)
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFPSmall, framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output against the reference within tolerance_f32. NOTE(review): this NIGHTLY case reuses Pooling3dLayerDatasetFPSmall - confirm Pooling3dLayerDatasetFP was not intended
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output against the reference within tolerance_f32; PoolingSize (27, 13, 4) equals dims 1-3 of both InputShape entries
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunGlobalSmall, NEPooling3dLayerGlobalFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output of the global-pooling fixture within tolerance_f32. NOTE(review): the FP16 counterpart is named RunSmallGlobal - consider aligning the two names
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output: the pooling size (79, 37, 11) equals dimensions 1-3 of both input shapes; nightly only, checked within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5x5Shapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))))
+{
+    // Validate output within tolerance_f16. NOTE(review): shapes come from Small5x5Shapes, while the FP32 RunSmall uses Small5dShapes - confirm this is intended
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType",
+                                                                                                                   DataType::F16))))
+{
+    // Validate output: FP16 nightly run of Large5dShapes combined with Pooling3dLayerDatasetFP, checked against _reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output: the pooling size (27, 13, 4) equals dimensions 1-3 of both input shapes, with unit strides and zero padding; checked within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunSmallGlobal, NEPooling3dLayerGlobalFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output of the global-pooling fixture for FP16: _target must match _reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output within tolerance_f16. NOTE(review): ExcludePadding is only false here, while the FP32 GlobalPooling RunLarge covers {false, true} - confirm this is intended
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END() // Float
+TEST_SUITE(Quantized)
+
+template <typename T>
+using NEPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture<Tensor, Accessor, NEPooling3dLayer, T>; // Quantized validation fixture bound to Tensor, Accessor and NEPooling3dLayer
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                       combine(Pooling3dLayerDatasetQASYMM8Small,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       qasymm8_in_qinfo_dataset),
+                                                                                                                       qasymm8_out_qinfo_dataset))
+{
+    // Validate output: uint8 precommit run with separate input and output quantization datasets, checked against _reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                       combine(Pooling3dLayerDatasetQASYMM8Large,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       qasymm8_in_qinfo_dataset),
+                                                                                                                       qasymm8_out_qinfo_dataset))
+{
+    // Validate output: uint8 nightly run of Large5dShapes combined with Pooling3dLayerDatasetQASYMM8Large, checked against _reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                      combine(Pooling3dLayerDatasetQASYMM8Small,
+                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                      qasymm8_signed_in_qinfo_dataset),
+                                                                                                                      qasymm8_signed_out_qinfo_dataset))
+{
+    // Validate output: int8 run reuses Pooling3dLayerDatasetQASYMM8Small with the signed input/output quantization datasets; checked within tolerance_qasymm8_s
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // Pooling3dLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 24e552ed0c..161fe627cc 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/PoolingLayerDataset.h"
@@ -81,6 +80,14 @@ const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQu
     QuantizationInfo(.1f, -5),  // Multiplier <= 1
     QuantizationInfo(2.f, -3)   // Multiplier > 1
 });
+
+// Cases where the pooling region is completely outside the input tensor (excluding global pooling); the five lists below are zipped and hold four aligned entries each
+const auto pool_outside_input_dataset = zip(zip(zip(zip(
+                                                        framework::dataset::make("Shape", { TensorShape{ 2U, 2U, 1U }, TensorShape{ 2U, 2U, 4U }, TensorShape{ 3U, 5U, 2U }, TensorShape{ 10U, 20U, 3U } }),
+                                                        framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                                    framework::dataset::make("PoolingSize", { Size2D{ 2, 2 }, Size2D{ 3, 3 }, Size2D{ 2, 2 }, Size2D{ 3, 6 } })),
+                                                framework::dataset::make("PadStride", { PadStrideInfo{ 1, 1, 2, 2 }, PadStrideInfo{ 1, 1, 4, 4 }, PadStrideInfo{ 1, 1, 3, 3 }, PadStrideInfo{ 1, 1, 2, 5 } })),
+                                            framework::dataset::make("ExcludePadding", { false, false, false, false }));
 } // namespace
 
 TEST_SUITE(NEON)
@@ -97,7 +104,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),     // Invalid output Global Pooling
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8), // Invalid exclude_padding = false with quantized type, no actual padding and NHWC
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),
-                                          }),
+                                            TensorInfo(TensorShape(1U, 16U, 1U),  1, DataType::F32),
+                                            TensorInfo(TensorShape(112, 112, 64,1), 1, DataType::F32, DataLayout::NHWC), // Mismatching number of channels
+                                            TensorInfo(TensorShape(112, 112, 64,1), 1, DataType::F32, DataLayout::NHWC), // Mismatching width
+                                         }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32),
@@ -106,7 +116,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(2U, 2U, 5U), 1, DataType::F32),
                                             TensorInfo(TensorShape(12U, 12U, 5U), 1, DataType::QASYMM8),
                                             TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                          })),
+                                            TensorInfo(TensorShape(1U, 15U, 1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(56, 56, 64,1), 1, DataType::F32, DataLayout::NHWC),
+                                            TensorInfo(TensorShape(56, 51, 64,1), 1, DataType::F32, DataLayout::NHWC),
+
+                                           })),
     framework::dataset::make("PoolInfo",  { PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                             PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                             PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 2, 0)),
@@ -115,8 +129,12 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             PoolingLayerInfo(PoolingType::MAX, DataLayout::NCHW),
                                             PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NHWC, PadStrideInfo(), false),
                                             PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW),
+                                            PoolingLayerInfo(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(1, 1, 0, 0), false),
+                                            PoolingLayerInfo(PoolingType::MAX,3,DataLayout::NHWC,PadStrideInfo(2,2,1,1)),
+                                            PoolingLayerInfo(PoolingType::MAX,3,DataLayout::NHWC,PadStrideInfo(2,2,1,1)),
+
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, true, false, false, false, false})),
     input_info, output_info, pool_info, expected)
 {
     bool is_valid = bool(NEPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info));
@@ -130,6 +148,8 @@ using NEPoolingLayerIndicesFixture = PoolingLayerIndicesValidationFixture<Tensor
 
 template <typename T>
 using NEPoolingLayerFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
+template <typename T>
+using NEPoolingLayerMixedDataLayoutFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T, true>; // Same fixture as NEPoolingLayerFixture with the extra template flag set to true (presumably the mixed-data-layout switch - confirm in the fixture header)
 
 template <typename T>
 using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
@@ -137,27 +157,37 @@ using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor
 const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
                                                                framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0) })),
                                                        framework::dataset::make("ExcludePadding", { true, false }));
-
+const auto PoolingLayerKernelIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(7, 7) })),
+                                                                     framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 1, 1, 1) })),
+                                                             framework::dataset::make("ExcludePadding", { false })); // MAX pooling only; ExcludePadding is fixed to false
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
-                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                           DataType::F32))),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
-
-                                                                                                                  ))
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                   combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                           framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("UseKernelIndices", { false })))
+{
+    // Validate both the pooled output (within tolerance_f32) and the indices tensor against their references; UseKernelIndices is false here
+    validate(Accessor(_target), _reference, tolerance_f32);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+FIXTURE_DATA_TEST_CASE(RunKernelIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                   combine(PoolingLayerKernelIndicesDatasetFPSmall,
+                                                                                                                           framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("UseKernelIndices", { true })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
     validate(Accessor(_target_indices), _ref_indices);
 }
-
 FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType",
                                                                                                                           DataType::F32))),
                                                                                                           pool_data_layout_dataset))
@@ -165,6 +195,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(datasets::PoolingTypes(),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0) })),
+                                       framework::dataset::make("ExcludePadding", { false })),
+                               framework::dataset::make("DataType", DataType::F32))),
+                       pool_data_layout_dataset))
+{
+    // Validate output of the mixed-data-layout fixture (2x2 pooling, PadStrideInfo(2, 1, 0, 0), ExcludePadding false) within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
                                                                                                                 framework::dataset::make("DataType",
                                                                                                                         DataType::F32))),
@@ -173,11 +214,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       pool_data_layout_dataset))
+{
+    // Validate output for the pool_outside_input_dataset cases combined with pool_data_layout_dataset, checked against _reference within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // CornerCases
 TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                  combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                          framework::dataset::make("DataType",
+                                                                                                                                  DataType::F16))),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                  framework::dataset::make("UseKernelIndices", { false })))
+{
+    // Validate both the pooled output (within tolerance_f16) and the indices tensor against their references; UseKernelIndices is false here
+    validate(Accessor(_target), _reference, tolerance_f16);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                  framework::dataset::make("DataType", DataType::F16))),
                                                                                                          pool_data_layout_dataset))
 {
@@ -191,6 +253,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::Dataset
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+                       framework::dataset::make("DataType",
+                                                DataType::F16)),
+                       pool_data_layout_dataset))
+{
+    // Validate output for the pool_outside_input_dataset cases combined with pool_data_layout_dataset, checked against _reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // CornerCases
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // Float
@@ -199,9 +271,11 @@ TEST_SUITE(Quantized)
 
 template <typename T>
 using NEPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T>;
+template <typename T>
+using NEPoolingLayerQuantizedMixedDataLayoutFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T, true>; // Same fixture as NEPoolingLayerQuantizedFixture with the extra template flag set to true (presumably the mixed-data-layout switch - confirm in the fixture header)
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmallNCHW, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmallNCHW, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
                        combine(PoolingLayerDatasetQASYMM8Small,
                                framework::dataset::make("DataType", DataType::QASYMM8))),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
@@ -211,7 +285,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallNCHW, NEPoolingLayerQuantizedFixture<uint8_t>, fr
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
                                                                                                                      combine(PoolingLayerDatasetQASYMM8Small,
                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8))),
                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NHWC })),
@@ -221,24 +295,40 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framew
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() // QASYMM8
-TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmallNCHW, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                        combine(PoolingLayerDatasetQASYMM8Small,
-                                                                                                                                framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                                        qasymm8_signed_in_qinfo_dataset),
-                                                                                                                        qasymm8_signed_in_qinfo_dataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 255.f, 5) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
                                                                                                                     combine(PoolingLayerDatasetQASYMM8Small,
                                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                     qasymm8_signed_in_qinfo_dataset),
-                                                                                                                    qasymm8_signed_out_qinfo_dataset))
+                                                                                                                    qasymm8_signed_out_qinfo_dataset))
+{
+    // Validate output within tolerance_qasymm8_s; the last dataset is the output quantization, so it is qasymm8_signed_out_qinfo_dataset rather than a second copy of the input dataset
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_s);
diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp
index 617f64ce1d..9738213114 100644
--- a/tests/validation/NEON/QLSTMLayerNormalization.cpp
+++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp
@@ -167,7 +167,7 @@ TEST_SUITE(Quantized)
 TEST_SUITE(QSYMM16)
 
 /** Tests will be targetting
- * - Comparison between Neon kernel and the exact same but scalar version of reference kernel
+ * - Comparison between optimized kernel and the exact same but scalar version of reference kernel
  * - Input shapes of 1D and 2D with the first dimension covers boundary values of 128-bit vector size (0~3 iterations)
  * - Weight and bias 1D shape that have same size as that of input shapes
  * - Quantization scale is greater and smaller than one.
@@ -179,7 +179,7 @@ TEST_SUITE(QSYMM16)
  * - The algorithm has been sensitive to quantization scale but it is hard to fully test
  *   the sensitivity due to aforementioned reason.
  * - Again, it is hard to fully test corner values due to the exact same algorithm of the
- *   reference kernel and the Neon kernel.
+ *   reference kernel and the optimized kernel.
  */
 
 constexpr uint32_t qsymm16_per_vector = vector_size_byte / sizeof(int16_t);
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index aeee54c835..bab7490762 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/QuantizationLayerFixture.h"
 
+
 namespace arm_compute
 {
 namespace test
@@ -182,7 +183,16 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8GenFixture<uin
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 15) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 15), QuantizationInfo(0.5f, 25) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(ConvertUint8toInt8, NEQuantizationLayerQASYMM8GenFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
+                       framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(2.0f, -1) })),
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 127) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
@@ -191,7 +201,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8_SIGNED, NEQuantizationLayerQASYMM8_SIGNED
                        framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 10), QuantizationInfo(2.0f, -25) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 15) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 15), QuantizationInfo(1.0f, 127) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -211,7 +221,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8_SIGNED, NEQuantizationLayerQASYMM8_SIGNED
                        framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 10) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, -5) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, -5), QuantizationInfo(1.0f, 43) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -220,11 +230,21 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8GenFixture<int
                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(2.0f, 10), QuantizationInfo(2.0f, -25) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 30) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 30), QuantizationInfo(2.0f, -128) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(ConvertInt8toUint8, NEQuantizationLayerQASYMM8_SIGNEDGenFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
+                       framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 0) })),
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, -128) })))
+{
+    // Validate output. NOTE(review): DataTypeOut is QASYMM8 (unsigned), yet this uses tolerance_s8 and the QASYMM8_SIGNED fixture alias - confirm the pairing is intended.
+    validate(Accessor(_target), _reference, tolerance_s8);
+}
+
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
diff --git a/tests/validation/NEON/RNNLayer.cpp b/tests/validation/NEON/RNNLayer.cpp
index 14d9a5d14e..979aa0f2c5 100644
--- a/tests/validation/NEON/RNNLayer.cpp
+++ b/tests/validation/NEON/RNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -139,7 +139,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NERNNLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallRNNLayerDataset(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+    validate(Accessor(_target), _reference, tolerance_f16, 0.02f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/tests/validation/NEON/ROIAlignLayer.cpp b/tests/validation/NEON/ROIAlignLayer.cpp
index 7bfdddf10d..98c92a0b20 100644
--- a/tests/validation/NEON/ROIAlignLayer.cpp
+++ b/tests/validation/NEON/ROIAlignLayer.cpp
@@ -53,6 +53,7 @@ AbsoluteTolerance<float> absolute_tolerance_f16(0.001f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_s(1);
 } // namespace
 
 TEST_SUITE(NEON)
@@ -154,7 +155,7 @@ FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture<int8_t>, framework
                                framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
diff --git a/tests/validation/NEON/ROIPoolingLayer.cpp b/tests/validation/NEON/ROIPoolingLayer.cpp
new file mode 100644
index 0000000000..8b5147e57f
--- /dev/null
+++ b/tests/validation/NEON/ROIPoolingLayer.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/Globals.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ROIDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ROIPoolingLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> relative_tolerance_f32(0.01f);
+AbsoluteTolerance<float> absolute_tolerance_f32(0.001f);
+
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+} // end namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(RoiPooling)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Successful test
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::QASYMM8), // Successful test (quantized)
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Incorrect rois type
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching data type input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 2U), 1, DataType::F32), // Mismatching depth size input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching number of rois and output batch size
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Invalid number of values per ROIS
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching height and width input/output
+                                                       // zip() pairs the datasets entry by entry: every dataset below needs exactly one entry per case above.
+                                                     }),
+               framework::dataset::make("RoisInfo", { TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::F16), // F16 rois: the "Incorrect rois type" case
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 10U), 1, DataType::U16), // 10 rois against an output batch size of 4
+                                                      TensorInfo(TensorShape(4, 4U), 1, DataType::U16), // 4 values per roi instead of 5
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                    })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F16), // F16 output against F32 input
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(5U, 5U, 3U, 4U), 1, DataType::F32), // 5x5 output against the 7x7 pooled size
+                                                     })),
+               framework::dataset::make("PoolInfo", { ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      })),
+               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false })), // one expectation per case (8)
+               input_info, rois_info, output_info, pool_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(NEROIPoolingLayer::validate(&input_info.clone()->set_is_resizable(true), &rois_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), pool_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using NEROIPoolingLayerFloatFixture = ROIPoolingLayerFixture<Tensor, Accessor, NEROIPoolingLayer, float>;
+
+TEST_SUITE(Float)
+FIXTURE_DATA_TEST_CASE(SmallROIPoolingLayerFloat, NEROIPoolingLayerFloatFixture, framework::DatasetMode::ALL,
+                       framework::dataset::combine(framework::dataset::combine(datasets::SmallROIDataset(),
+                                                                               framework::dataset::make("DataType", { DataType::F32 })),
+                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output with the F32 relative (0.01) and absolute (0.001) tolerances; .02f is presumably the tolerated fraction of mismatches - confirm in Validation.h
+    validate(Accessor(_target), _reference, relative_tolerance_f32, .02f, absolute_tolerance_f32);
+}
+
+TEST_SUITE_END() // Float test suite end
+
+// Begin quantized tests
+TEST_SUITE(Quantized)
+template <typename T>
+using NEROIPoolingLayerQuantizedFixture = ROIPoolingLayerQuantizedFixture<Tensor, Accessor, NEROIPoolingLayer, T>;
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(Small, NEROIPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(datasets::SmallROIDataset(),
+                                                       framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 127) })),
+                               framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
+{
+    // Validate output against the reference using tolerance_qasymm8, an absolute tolerance of 1 on the uint8_t values
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // end qasymm8 tests
+TEST_SUITE_END() // end quantized tests
+
+TEST_SUITE_END() // RoiPooling
+TEST_SUITE_END() // NEON
+
+} // validation end
+} // test namespace end
+} // arm_compute namespace end
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index b4a3f0d399..8ca0bb53a7 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,10 +46,15 @@ constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value f
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 constexpr AbsoluteTolerance<float> tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 #endif                                                   // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef __aarch64__
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
+#else // __aarch64__
+constexpr AbsoluteTolerance<uint8_t> tolerance_u8(2);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
 constexpr AbsoluteTolerance<int8_t>  tolerance_s8(2);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
+#endif // __aarch64__
 
-const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }),
+const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
 const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3) }), framework::dataset::make("KeepDims", { false }));
 } // namespace
diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp
deleted file mode 100644
index 3c02f8eece..0000000000
--- a/tests/validation/NEON/Remap.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/RemapFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(0);
-constexpr float                      tolerance_number = 0.f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Remap)
-
-template <typename T>
-using NERemapFixture = RemapValidationFixture<Tensor, Accessor, NERemap, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ReorderLayer.cpp b/tests/validation/NEON/ReorderLayer.cpp
new file mode 100644
index 0000000000..839ad0ac92
--- /dev/null
+++ b/tests/validation/NEON/ReorderLayer.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ReorderLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ReorderFixture.h"
+#include "src/core/NEON/kernels/NEReorderKernel.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(ReorderLayer)
+
+template <typename T>
+using NEReorderLayerAlias = ReorderValidationFixture<Tensor, Accessor, NEReorderLayer, T>;
+
+TEST_SUITE(FP32)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+DATA_TEST_CASE(ValidateReorderOHWIo8, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("InShape", { TensorShape(10U, 9U), TensorShape(234U, 301U) }),
+                                                                     make("OutShape", { TensorShape(10U, 16U), TensorShape(234U, 304U) })
+                                                                    ),
+                                                                    zip(
+                                                                        make("InputWeightFormat", { WeightFormat::OHWI }),
+                                                                        make("OutputWeightFormat", { WeightFormat::OHWIo8 })
+                                                                    )),
+            input_shape, output_shape, input_wf, output_wf)
+{
+    // Nothing is checked unless the CPU reports SVE support.
+    if(Scheduler::get().cpu_info().has_sve())
+    {
+        // validate() is expected to succeed only when get_vector_length<float>() reports 8.
+        int  lanes_f32       = arm_gemm::utils::get_vector_length<float>();
+        bool expect_accepted = (lanes_f32 == 8);
+
+        TensorInfo src_info(input_shape, 1, DataType::F32);
+        TensorInfo dst_info(output_shape, 1, DataType::F32);
+
+        // The check is issued through a layer instance, F32 in and F32 out.
+        arm_compute::NEReorderLayer reorder;
+        Status                      result = reorder.validate(&src_info, &dst_info, input_wf, output_wf);
+
+        ARM_COMPUTE_EXPECT((expect_accepted == bool(result)), framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("DataType", DataType::F32)))
+{
+    // Validate output only when _hardware_supports is set (presumably by the fixture - confirm); otherwise nothing is compared
+    if (_hardware_supports)
+    {
+        validate(Accessor(_target), _reference);
+    }
+}
+#endif // ARM_COMPUTE_ENABLE_SVE
+
+FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE_END() // ReorderLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif  // defined(__aarch64__)
diff --git a/tests/validation/NEON/ReshapeLayer.cpp b/tests/validation/NEON/ReshapeLayer.cpp
index bf39c399a5..e9f114d491 100644
--- a/tests/validation/NEON/ReshapeLayer.cpp
+++ b/tests/validation/NEON/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,9 @@ input_info, output_info, expected)
 template <typename T>
 using NEReshapeLayerFixture = ReshapeLayerValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
 
+template <typename T>
+using NEReshapeLayerPaddedFixture = ReshapeLayerPaddedValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
@@ -84,8 +87,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S8)
@@ -94,7 +97,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int8_t>, framework::Datas
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() //S8
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
@@ -102,11 +105,41 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::Data
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+
+TEST_SUITE(Padded)
+TEST_SUITE(Float)
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
+
+TEST_SUITE(Integer)
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int8_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S8
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+TEST_SUITE_END() //Padded
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //ReshapeLayer
+TEST_SUITE_END() //NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp
index 3dc3eeee80..7b5337f14b 100644
--- a/tests/validation/NEON/Reverse.cpp
+++ b/tests/validation/NEON/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,8 @@ namespace validation
 {
 namespace
 {
-auto run_small_dataset = combine(datasets::SmallShapes(), datasets::Tiny1DShapes());
+using framework::dataset::make;
+auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes());
 auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes());
 
 } // namespace
@@ -53,28 +54,31 @@ TEST_SUITE(Reverse)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
+        make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis shape
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis length (> 4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes
+                                            TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8), // Unsupported source dimensions (>4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         }),
-        framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+        make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U, 13U, 2U), 1, DataType::U8),
+                                            TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         })),
-        framework::dataset::make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8),
+        make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8),
                                            TensorInfo(TensorShape(2U, 10U), 1, DataType::U32),
                                            TensorInfo(TensorShape(8U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
+                                           TensorInfo(TensorShape(2U), 1, DataType::U32),
         })),
-        framework::dataset::make("Expected", { false, false, false, false, true, true})),
+        make("Expected", { false, false, false, false, false, true, true})),
         src_info, dst_info, axis_info, expected)
 {
     Status s = NEReverse::validate(&src_info.clone()->set_is_resizable(false),
@@ -95,7 +99,11 @@ TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -104,7 +112,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<half>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -116,7 +128,11 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -125,7 +141,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<float>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -138,7 +158,11 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -147,7 +171,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index bb1ab936d1..f1209a21ac 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,16 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ScaleValidationDataset.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
-#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
 
@@ -50,8 +44,8 @@ using datasets::ScaleSamplingPolicySet;
 using datasets::ScaleAlignCornersSamplingPolicySet;
 
 /** We consider vector size in byte 64 since the maximum size of
- * a vector used by @ref NEScaleKernel is currently 64-byte (float32x4x4).
- * There are possibility to reduce test time further by using
+ * a vector used by the kernel is currently 64-byte (float32x4x4).
+ * There is a possibility to reduce test time further by using
  * smaller vector sizes for different data types where applicable.
  */
 constexpr uint32_t vector_byte = 64;
@@ -62,25 +56,31 @@ constexpr uint32_t num_elements_per_vector()
     return vector_byte / sizeof(T);
 }
 
-/** Scale data types */
-const auto ScaleDataTypes = framework::dataset::make("DataType",
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
 {
-    DataType::U8,
-    DataType::S16,
-    DataType::F32,
+    QuantizationInfo(0.5f, -10),
 });
 
 /** Quantization information data set */
-const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
+const auto InputQuantizationInfoSet = framework::dataset::make("InputQuantizationInfo",
 {
     QuantizationInfo(0.5f, -10),
 });
 
+/** Output quantization information data set */
+const auto OutputQuantizationInfoSet = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(0.2f, 20),
+});
+
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
 RelativeTolerance<float>             tolerance_f32(0.05);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr float         abs_tolerance_f16(0.01f);
 RelativeTolerance<half> tolerance_f16(half(0.1));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -94,9 +94,8 @@ TEST_SUITE(Validate)
 
 /** Validate test suite is to test ARM_COMPUTE_RETURN_ON_* macros
  * we use to check the validity of given arguments in @ref NEScale
- * and subsequent call to @ref NEScaleKernel.
  * Since this is using validate() of @ref NEScale, which pre-adjust
- * arguments for @ref NEScaleKernel, the following conditions in
+ * arguments for the kernel, the following conditions in
  * the kernel are not currently tested.
  * - The same input and output
  * - Data type of offset, dx and dy
@@ -156,8 +155,6 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
         { DataType::BFLOAT16, false },
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         { DataType::F16, true },
-#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { DataType::F16, false },
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         { DataType::F32, true },
         { DataType::F64, false },
@@ -317,11 +314,18 @@ DATA_TEST_CASE(CheckNoPaddingInterpAREA, framework::DatasetMode::ALL, combine(co
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
+using NEScaleMixedDataLayoutFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T, true>;
+template <typename T>
 using NEScaleQuantizedFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleDifferentOutputQuantizedFixture = ScaleValidationDifferentOutputQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleQuantizedMixedDataLayoutFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-const auto f32_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape      = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F32));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
@@ -331,6 +335,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
@@ -340,10 +353,38 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::D
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
 TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-const auto f16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape      = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F16));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
@@ -351,7 +392,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::A
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
 {
@@ -360,7 +401,34 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::Da
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<half>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -388,6 +456,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<uint8_t>, framework:
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END() // U8
+TEST_SUITE(S8)
+const auto s8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::S8));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+TEST_SUITE_END() // S8
 TEST_SUITE(S16)
 const auto s16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet))
@@ -423,6 +512,26 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::Da
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleAlignCornersSamplingPolicySet,
                        QuantizationInfoSet))
 {
@@ -446,6 +555,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::Dat
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet,
                        QuantizationInfoSet))
 {
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 2a9e30604e..94d0866c38 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,21 +25,22 @@
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/SoftmaxLayerFixture.h"
-
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -51,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -62,56 +63,55 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 
 TEST_SUITE(NEON)
 TEST_SUITE(SoftmaxLayer)
-
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis high
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis low
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                      }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                     })),
-               framework::dataset::make("beta", { 1.0,
-                                                  2.0,
-                                                  1.0,
-                                                  2.0,
-                                                  1.0,
-                                                  1.0,
-                                                  2.0,
-                                                  1.0,
-                                                })),
-               framework::dataset::make("axis", { 0,
-                                                  0,
-                                                  0,
-                                                  1,
-                                                  0,
-                                                  -1,
-                                                  2,
-                                                  -3,
-                                                })),
-               framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })),
-               input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis high
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis low
+                                    QuantizationInfo(1.f/256, 12)),
+                        }),
+    make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+                        TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        }),
+    make("beta", { 1.0,
+                   2.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                }),
+    make("axis", { 0,
+                   0,
+                   0,
+                   1,
+                   0,
+                   -1,
+                   2,
+                   -3,
+                }),
+    make("Expected", { false, false, false, true, true, true, false, false })),
+    input_info, output_info, beta, axis, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
 }
@@ -121,29 +121,80 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 template <typename T>
 using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>;
 
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, // Checks the name of the softmax kernel selected for each (CpuExt, DataType) row
+    concat( // two independent groups of rows: "neon" rows followed by "sme2" rows
+        combine(
+            make("CpuExt", std::string("neon")),
+            make("DataType", { DataType::F32,
+                            DataType::F16,
+                            DataType::QASYMM8,
+                            DataType::QASYMM8_SIGNED})
+        ),
+        combine( // sme2 group lists only the two floating-point types
+            make("CpuExt", std::string("sme2")),
+            make("DataType", { DataType::F32,
+                            DataType::F16}))
+        ), // closes concat(...) despite the indentation; the combine above is closed on the previous line
+        cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{}; // value-initialised; only the three flags below are set explicitly
+    cpu_isa.neon = (cpu_ext == "neon");
+    cpu_isa.sme2 = (cpu_ext == "sme2");
+    cpu_isa.fp16 = (data_type == DataType::F16); // fp16 flag is raised only for the F16 rows
+
+    const auto *selected_impl = CpuSoftmaxKernel::get_implementation( // query the Preferred implementation for this selector
+        SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()},
+        cpu::KernelSelectionType::Preferred); // NOTE: the selector's last field comes from CPUInfo::get(), not from cpu_isa
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); // selected_impl is dereferenced below, so it must not be null
+
+    std::string expected = cpu_ext + "_" + cpu_impl_dt(data_type) + "_softmax"; // expected name: <CpuExt>_<cpu_impl_dt(data_type)>_softmax
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                         framework::dataset::make("Axis", { 0, 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, // FP16 softmax over the SoftmaxLayerSmallShapes dataset
+    combine( // presumably yields every combination of the four datasets below - confirm against the framework's combine()
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -1 }))) // one non-negative and one negative axis index
+{
+    // Validate the target output against the reference, using tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 0, 2, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f }),
+        make("Axis", { 0, 2, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
-                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -152,26 +203,30 @@ TEST_SUITE_END() //FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 0, -2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::Small4DShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -2, 3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                        framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -184,29 +239,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor,
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                 combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                 combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { 0, 1, -2 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })),
+        make("Axis", { 0, 1, -2 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                   combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                   framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.0f })
+        ),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -214,20 +280,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew
 TEST_SUITE_END() //QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { 0, 1, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, 1, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp
index d88f713ccd..3828010c7b 100644
--- a/tests/validation/NEON/StackLayer.cpp
+++ b/tests/validation/NEON/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,69 +44,74 @@ namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
 namespace
 {
 // *INDENT-OFF*
 // clang-format off
 /** Data types */
-const auto data_types = framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
+const auto data_types = make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
 
 /** Num tensors values to test */
-const auto n_values = framework::dataset::make("NumTensors", { 3, 4 });
+const auto n_values = make("NumTensors", { 3, 4 });
 
 /** Shapes 1D to test */
-const auto shapes_1d_small = combine(datasets::Small1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_small = combine(datasets::Small1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_small = combine(datasets::Small2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_small = combine(datasets::Small2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_small = combine(datasets::Small3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_small = combine(datasets::Small3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_small = combine(datasets::Small4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_small = combine(datasets::Small4DShapes(), make("Axis", -4, 5));
 
 /** Shapes 1D to test */
-const auto shapes_1d_large = combine(datasets::Large1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_large = combine(datasets::Large1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_large = combine(datasets::Medium2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_large = combine(datasets::Medium2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_large = combine(datasets::Medium3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_large = combine(datasets::Medium3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_large = combine(datasets::Medium4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_large = combine(datasets::Medium4DShapes(), make("Axis", -4, 5));
 } // namespace
 
 /** Fixture to use */
 template<typename T>
 using NEStackLayerFixture = StackLayerValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
 
+template<typename T> // Padding variant of the fixture alias; takes the same template arguments as NEStackLayerFixture
+using NEStackLayerWithPaddingFixture = StackLayerWithPaddingValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
+
 using namespace arm_compute::misc::shape_calculator;
 
 TEST_SUITE(NEON)
 TEST_SUITE(StackLayer)
 
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                                      framework::dataset::make("InputInfo",
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+make("InputInfo",
 {
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::U8) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(2U, 3U), 1, DataType::S32) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::S32) },
 }),
-framework::dataset::make("OutputInfo",
+make("OutputInfo",
 {
     TensorInfo(TensorShape(1U, 9U, 8U), 1, DataType::U8),   // Passes, stack 1 tensor on x axis
     TensorInfo(TensorShape(1U, 3U, 2U), 1, DataType::U8),   // Passes, stack 3 tensors on y axis
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::S32),  // fails axis <  (- input's rank)
     TensorInfo(TensorShape(3U, 7U, 5U), 1, DataType::S32),  // fails, input dimensions > 4
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::U8),   // fails mismatching data types
-})),
-framework::dataset::make("Axis", { -3, 1, -4, -3, 1 })),
-framework::dataset::make("Expected", { true, true, false, false, false })),
+}),
+make("Axis", { -3, 1, -4, -3, 1 }),
+make("Expected", { true, true, false, false, false })),
 input_info, output_info, axis, expected)
 {
     std::vector<TensorInfo>    ti(input_info);
@@ -121,18 +126,18 @@ input_info, output_info, axis, expected)
 TEST_SUITE(Shapes1D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -141,18 +146,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -161,18 +166,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,18 +188,18 @@ TEST_SUITE_END() // Shapes1D
 TEST_SUITE(Shapes2D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -203,18 +208,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -223,18 +228,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -245,18 +250,18 @@ TEST_SUITE_END() // Shapes2D
 TEST_SUITE(Shapes3D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -265,18 +270,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -285,18 +290,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -307,18 +312,29 @@ TEST_SUITE_END() // Shapes3D
 TEST_SUITE(Shapes4D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+// Testing the case with padding for only 4d shapes and for one data type. This is because the underlying code
+// path depends only on the padding, which isn't affected by the shapes or data types.
+FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, NEStackLayerWithPaddingFixture<int>, framework::DatasetMode::ALL,
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -327,18 +343,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -347,24 +363,37 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes4D
+
+TEST_SUITE(HighDimensional)
+// The Cpu implementation supports tensors of size 4D+, but the reference implementation does not.
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, NEStackLayerFixture<char>, framework::DatasetMode::DISABLED,
+    combine(make("Shape", { TensorShape{2U, 3U, 4U, 5U, 3U} }),
+            make("Axis", { 5, 0, -3, 2 }),
+            make("DataType", { DataType::S8 }),
+            make("NumTensors", { 3 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // HighDimensional
 TEST_SUITE_END() // StackLayer
 TEST_SUITE_END() // Neon
 } // namespace validation
diff --git a/tests/validation/NEON/UNIT/MemoryManager.cpp b/tests/validation/NEON/UNIT/MemoryManager.cpp
index 83a9fcb332..2c57b534fe 100644
--- a/tests/validation/NEON/UNIT/MemoryManager.cpp
+++ b/tests/validation/NEON/UNIT/MemoryManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,15 +62,15 @@ TEST_CASE(BlobMemoryManagerSimpleWithinFunctionLevel, framework::DatasetMode::AL
     norm_layer_1.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 3));
     norm_layer_2.configure(&src, &dst, NormalizationLayerInfo(NormType::IN_MAP_1D, 3));
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
     // Allocate tensors
     src.allocator()->allocate();
     dst.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
     // Finalize memory manager
     mm->populate(allocator, 1 /* num_pools */);
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index f64d380423..e0d45c639a 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,19 @@ namespace validation
 {
 TEST_SUITE(NEON)
 TEST_SUITE(UNIT)
+#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+TEST_CASE(CpuCapacity, framework::DatasetMode::ALL)
+{
+    CPUInfo& ci =  arm_compute::Scheduler::get().cpu_info();
+    const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little();
+    const uint32_t num_threads = arm_compute::Scheduler::get().num_threads();
+
+    ARM_COMPUTE_EXPECT(num_threads<=nonlittle_num_cpus , framework::LogLevel::ERRORS);
+}
+#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+
 TEST_SUITE(RuntimeContext)
 
 TEST_CASE(Scheduler, framework::DatasetMode::ALL)
@@ -57,14 +70,14 @@ TEST_CASE(Scheduler, framework::DatasetMode::ALL)
     RuntimeContext ctx;
 
     // Check if it's been initialised properly
-    ARM_COMPUTE_EXPECT(ctx.scheduler() != nullptr, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(ctx.asset_manager() == nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(ctx.scheduler() != nullptr);
+    ARM_COMPUTE_ASSERT(ctx.asset_manager() == nullptr);
 
     // Create a Scheduler
     auto scheduler = SchedulerFactory::create();
     ctx.set_scheduler(scheduler.get());
     // Check if the scheduler has been properly setup
-    ARM_COMPUTE_EXPECT(ctx.scheduler() != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(ctx.scheduler() != nullptr);
 
     // Create a new activation function
     NEActivationLayer act_layer(&ctx);
@@ -74,14 +87,14 @@ TEST_CASE(Scheduler, framework::DatasetMode::ALL)
 
     act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR));
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
     // Allocate tensors
     src.allocator()->allocate();
     dst.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
 
     float min_bound = 0;
     float max_bound = 0;
@@ -117,10 +130,10 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
     act_layer_thread0.configure(&src_t0, &dst_t0, activation_info);
     act_layer_thread1.configure(&src_t1, &dst_t1, activation_info);
 
-    ARM_COMPUTE_EXPECT(src_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(src_t1.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst_t1.info()->is_resizable());
 
     // Allocate tensors
     src_t0.allocator()->allocate();
@@ -128,8 +141,8 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
     src_t1.allocator()->allocate();
     dst_t1.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!src_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!src_t1.info()->is_resizable());
 
     float min_bound = 0;
     float max_bound = 0;
diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp
index ef19524d1c..0aab9ef9b5 100644
--- a/tests/validation/NEON/UNIT/TensorAllocator.cpp
+++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,32 +61,32 @@ TEST_CASE(ImportMemory, framework::DatasetMode::ALL)
     // Negative case : Import nullptr
     Tensor t1;
     t1.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(!bool(t1.allocator()->import_memory(nullptr)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t1.allocator()->import_memory(nullptr)));
+    ARM_COMPUTE_ASSERT(t1.info()->is_resizable());
 
     // Negative case : Import misaligned pointer
     Tensor       t2;
     const size_t required_alignment = 339;
     t2.allocator()->init(info, required_alignment);
-    ARM_COMPUTE_EXPECT(!bool(t2.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t2.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(t2.info()->is_resizable());
 
     // Negative case : Import memory to a tensor that is memory managed
     Tensor      t3;
     MemoryGroup mg;
     t3.allocator()->set_associated_memory_group(&mg);
-    ARM_COMPUTE_EXPECT(!bool(t3.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t3.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(t3.info()->is_resizable());
 
     // Positive case : Set raw pointer
     Tensor t4;
     t4.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(bool(t4.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == reinterpret_cast<uint8_t *>(data.get()), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(t4.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(!t4.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(t4.buffer() == reinterpret_cast<uint8_t *>(data.get()));
     t4.allocator()->free();
-    ARM_COMPUTE_EXPECT(t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t4.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(t4.buffer() == nullptr);
 }
 
 TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
@@ -114,8 +114,8 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
     void *aligned_ptr = raw_data.get();
     std::align(required_alignment, total_size_in_bytes, aligned_ptr, space);
 
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(aligned_ptr)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(aligned_ptr)));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -137,7 +137,7 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
 
 TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
@@ -160,8 +160,8 @@ TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
     const size_t total_size_in_bytes = tensor.info()->total_size();
     auto         raw_data            = std::make_unique<uint8_t[]>(total_size_in_bytes);
 
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(raw_data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(raw_data.get())));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor while accounting padding
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -190,10 +190,10 @@ TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
 
-#if !defined(BARE_METAL)
+#if !defined(_WIN64) && !defined(BARE_METAL)
 TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 {
     const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);
@@ -221,12 +221,12 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
     // Map file
     utils::mmap_io::MMappedFile mmapped_file("test_mmap_import.bin", 0 /** Whole file */, 0);
-    ARM_COMPUTE_EXPECT(mmapped_file.is_mapped(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(mmapped_file.is_mapped());
     unsigned char *data = mmapped_file.data();
 
     // Import memory mapped memory
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(data)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(data)));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -248,9 +248,9 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
-#endif // !defined(BARE_METAL)
+#endif // !defined(_WIN64) && !defined(BARE_METAL)
 
 TEST_CASE(AlignedAlloc, framework::DatasetMode::ALL)
 {
@@ -262,7 +262,7 @@ TEST_CASE(AlignedAlloc, framework::DatasetMode::ALL)
     t.allocator()->init(info, requested_alignment);
     t.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(t.buffer() != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t.buffer() != nullptr);
     ARM_COMPUTE_EXPECT(t.allocator()->alignment() == requested_alignment, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(arm_compute::utility::check_aligned(reinterpret_cast<void *>(t.buffer()), requested_alignment),
                        framework::LogLevel::ERRORS);
diff --git a/tests/validation/UNIT/CPPScheduler.cpp b/tests/validation/UNIT/CPPScheduler.cpp
new file mode 100644
index 0000000000..6a3f6819fc
--- /dev/null
+++ b/tests/validation/UNIT/CPPScheduler.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+
+#include <stdexcept>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+
+namespace
+{
+class TestException: public std::exception
+{
+public:
+    const char* what() const noexcept override
+    {
+        return "Expected test exception";
+    }
+};
+
+class TestKernel: public ICPPKernel
+{
+public:
+    TestKernel()
+    {
+        Window window;
+        window.set(0, Window::Dimension(0, 2));
+        configure(window);
+    }
+
+    const char* name() const override
+    {
+        return "TestKernel";
+    }
+
+    void run(const Window &, const ThreadInfo &) override
+    {
+        throw TestException();
+    }
+
+};
+}
+
+TEST_SUITE(UNIT)
+TEST_SUITE(CPPScheduler)
+#if defined(ARM_COMPUTE_CPP_SCHEDULER) && !defined(BARE_METAL)
+TEST_CASE(RethrowException, framework::DatasetMode::ALL)
+{
+    CPPScheduler scheduler;
+    CPPScheduler::Hints hints(0);
+    TestKernel kernel;
+
+    scheduler.set_num_threads(2);
+    try
+    {
+        scheduler.schedule(&kernel, hints);
+    }
+    catch(const TestException&)
+    {
+        return;
+    }
+    ARM_COMPUTE_EXPECT_FAIL("Expected exception not caught", framework::LogLevel::ERRORS);
+}
+#endif // defined(ARM_COMPUTE_CPP_SCHEDULER) && !defined(BARE_METAL)
+TEST_SUITE_END()
+TEST_SUITE_END()
diff --git a/tests/validation/UNIT/GPUTarget.cpp b/tests/validation/UNIT/GPUTarget.cpp
index e1b7e1fe3f..2e64635b7a 100644
--- a/tests/validation/UNIT/GPUTarget.cpp
+++ b/tests/validation/UNIT/GPUTarget.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/GPUTarget.h"
-#include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Asserts.h"
@@ -38,6 +37,7 @@ TEST_SUITE(GPUTarget)
 
 TEST_CASE(GetGPUTargetFromName, framework::DatasetMode::ALL)
 {
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T000") == GPUTarget::MIDGARD, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T600") == GPUTarget::T600, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T700") == GPUTarget::T700, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T800") == GPUTarget::T800, framework::LogLevel::ERRORS);
@@ -46,15 +46,24 @@ TEST_CASE(GetGPUTargetFromName, framework::DatasetMode::ALL)
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G51") == GPUTarget::G51, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G51BIG") == GPUTarget::G51BIG, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G51LIT") == GPUTarget::G51LIT, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G52") == GPUTarget::G52, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G52LIT") == GPUTarget::G52LIT, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G31") == GPUTarget::G31, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G76") == GPUTarget::G76, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G76 r0p0") == GPUTarget::G76, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G52") == GPUTarget::G52, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G52LIT") == GPUTarget::G52LIT, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G77") == GPUTarget::G77, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G57") == GPUTarget::G57, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G78") == GPUTarget::G78, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G78AE") == GPUTarget::G78, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-TODX") == GPUTarget::TODX, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T000") == GPUTarget::MIDGARD, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G68") == GPUTarget::G68, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G78AE") == GPUTarget::G78AE, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G710") == GPUTarget::G710, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G610") == GPUTarget::G610, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G510") == GPUTarget::G510, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G310") == GPUTarget::G310, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G715") == GPUTarget::G715, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G615") == GPUTarget::G615, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G720") == GPUTarget::G720, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G620") == GPUTarget::G620, framework::LogLevel::ERRORS);
 }
 
 TEST_CASE(GPUTargetIsIn, framework::DatasetMode::ALL)
diff --git a/tests/validation/UNIT/SafeIntegerOps.cpp b/tests/validation/UNIT/SafeIntegerOps.cpp
index 62f70414f1..13e4ef5125 100644
--- a/tests/validation/UNIT/SafeIntegerOps.cpp
+++ b/tests/validation/UNIT/SafeIntegerOps.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/utils/math/SafeOps.h"
-#include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Asserts.h"
diff --git a/tests/validation/UNIT/SubTensorInfo.cpp b/tests/validation/UNIT/SubTensorInfo.cpp
index 5a930620ce..ca5e46550c 100644
--- a/tests/validation/UNIT/SubTensorInfo.cpp
+++ b/tests/validation/UNIT/SubTensorInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,6 +69,7 @@ TEST_CASE(SubTensorCreation, framework::DatasetMode::ALL)
  *  - A) Extend padding when SubTensor XY does not match parent tensor should fail
  *    B) Extend with zero padding when SubTensor XY does not match parent tensor should succeed
  *  - C) Extend padding when SubTensor XY matches parent tensor should succeed
+ *  - D) Extend padding when lock paddings is set to true should fail
  */
 TEST_CASE(SubTensorPaddingExpansion, framework::DatasetMode::ALL)
 {
@@ -95,6 +96,14 @@ TEST_CASE(SubTensorPaddingExpansion, framework::DatasetMode::ALL)
         ARM_COMPUTE_EXPECT(tensor_info.padding().top == 2, framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(tensor_info.padding().right == 1, framework::LogLevel::ERRORS);
     }
+
+    // Test D
+    {
+        TensorInfo    tensor_info(TensorShape(23U, 17U, 3U), 1, DataType::F32);
+        SubTensorInfo sub_tensor_info(&tensor_info, TensorShape(4U, 3U, 1U), Coordinates(5, 2, 1));
+        sub_tensor_info.set_lock_paddings(true);
+        ARM_COMPUTE_EXPECT_THROW(sub_tensor_info.extend_padding(PaddingSize(2, 1)), framework::LogLevel::ERRORS);
+    }
 }
 
 TEST_SUITE_END() // SubTensorInfo
diff --git a/tests/validation/UNIT/TensorInfo.cpp b/tests/validation/UNIT/TensorInfo.cpp
index cf9dfeabe9..b79c1e9253 100644
--- a/tests/validation/UNIT/TensorInfo.cpp
+++ b/tests/validation/UNIT/TensorInfo.cpp
@@ -93,7 +93,7 @@ TEST_CASE(Clone, framework::DatasetMode::ALL)
 
     // Get clone of current tensor info
     std::unique_ptr<ITensorInfo> info_clone = info.clone();
-    ARM_COMPUTE_EXPECT(info_clone != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(info_clone != nullptr);
     ARM_COMPUTE_EXPECT(info_clone->total_size() == info.total_size(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(info_clone->num_channels() == info.num_channels(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(info_clone->data_type() == info.data_type(), framework::LogLevel::ERRORS);
@@ -184,8 +184,17 @@ TEST_CASE(SymmPerChannelQuantizationInfo, framework::DatasetMode::ALL)
     ARM_COMPUTE_EXPECT(info.quantization_info().offset().empty(), framework::LogLevel::ERRORS);
 }
 
-TEST_SUITE_END() // TensorInfoValidation
-TEST_SUITE_END()
+/** Validates that extend_padding() throws once the lock paddings flag is set */
+TEST_CASE(SubTensorPaddingExpansion, framework::DatasetMode::ALL)
+{
+    TensorInfo    tensor_info(TensorShape(23U, 17U, 3U), 1, DataType::F32);
+    tensor_info.set_lock_paddings(true);
+
+    // Lock paddings is now set to true, so extending the padding is expected to throw
+    ARM_COMPUTE_EXPECT_THROW(tensor_info.extend_padding(PaddingSize(2, 1)), framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // TensorInfo
+TEST_SUITE_END() // UNIT
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index d356f05b70..289aca4d08 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,17 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+// Check whether 2 values are both non-finite (tested with !isfinite, so presumably NaN as well as infinity) and have the same sign
+template <typename T>
+inline bool are_equal_infs(T val0, T val1)
+{
+    const auto same_sign = support::cpp11::signbit(val0) == support::cpp11::signbit(val1);
+    return (!support::cpp11::isfinite(val0)) && (!support::cpp11::isfinite(val1)) && same_sign;
+}
+} // namespace
+
 /** Class reprensenting an absolute tolerance value. */
 template <typename T>
 class AbsoluteTolerance
@@ -140,7 +151,7 @@ bool compare_dimensions(const Dimensions<T> &dimensions1, const Dimensions<T> &d
 {
     ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
 
-    if(data_layout == DataLayout::NCHW)
+    if(data_layout != DataLayout::NHWC)
     {
         if(dimensions1.num_dimensions() != dimensions2.num_dimensions())
         {
@@ -157,24 +168,22 @@ bool compare_dimensions(const Dimensions<T> &dimensions1, const Dimensions<T> &d
     }
     else
     {
-        // In case a 2D shape becomes 3D after permutation, the permuted tensor will have one dimension more and the first value will be 1
-        if((dimensions1.num_dimensions() != dimensions2.num_dimensions()) && ((dimensions1.num_dimensions() != (dimensions2.num_dimensions() + 1)) || (dimensions1.x() != 1)))
+        // In case a 1D/2D shape becomes 3D after permutation, the permuted tensor will have two/one dimension(s) more and the first (two) value(s) will be 1
+        // clang-format off
+        const auto max_dims = std::max(dimensions1.num_dimensions(), dimensions2.num_dimensions());
+        for(unsigned int i = 3; i < max_dims; ++i)
         {
-            return false;
+            if(dimensions1[i] != dimensions2[i])
+            {
+                return false;
+            }
         }
+        // clang-format on
 
         if((dimensions1[0] != dimensions2[2]) || (dimensions1[1] != dimensions2[0]) || (dimensions1[2] != dimensions2[1]))
         {
             return false;
         }
-
-        for(unsigned int i = 3; i < dimensions1.num_dimensions(); ++i)
-        {
-            if(dimensions1[i] != dimensions2[i])
-            {
-                return false;
-            }
-        }
     }
 
     return true;
@@ -267,16 +276,6 @@ void validate(std::vector<unsigned int> classified_labels, std::vector<unsigned
 template <typename T, typename U = AbsoluteTolerance<T>>
 bool validate(T target, T reference, U tolerance = AbsoluteTolerance<T>());
 
-/** Validate key points. */
-template <typename T, typename U, typename V = AbsoluteTolerance<float>>
-void validate_keypoints(T target_first, T target_last, U reference_first, U reference_last, V tolerance = AbsoluteTolerance<float>(),
-                        float allowed_missing_percentage = 5.f, float allowed_mismatch_percentage = 5.f);
-
-/** Validate detection windows. */
-template <typename T, typename U, typename V = AbsoluteTolerance<float>>
-void validate_detection_windows(T target_first, T target_last, U reference_first, U reference_last, V tolerance = AbsoluteTolerance<float>(),
-                                float allowed_missing_percentage = 5.f, float allowed_mismatch_percentage = 5.f);
-
 template <typename T>
 struct compare_base
 {
@@ -308,9 +307,9 @@ struct compare<AbsoluteTolerance<U>> : public compare_base<AbsoluteTolerance<U>>
     /** Perform comparison */
     operator bool() const
     {
-        if(!support::cpp11::isfinite(this->_target) || !support::cpp11::isfinite(this->_reference))
+        if(are_equal_infs(this->_target, this->_reference))
         {
-            return false;
+            return true;
         }
         else if(this->_target == this->_reference)
         {
@@ -334,9 +333,9 @@ struct compare<RelativeTolerance<U>> : public compare_base<RelativeTolerance<U>>
     /** Perform comparison */
     operator bool() const
     {
-        if(!support::cpp11::isfinite(this->_target) || !support::cpp11::isfinite(this->_reference))
+        if(are_equal_infs(this->_target, this->_reference))
         {
-            return false;
+            return true;
         }
         else if(this->_target == this->_reference)
         {
@@ -506,9 +505,9 @@ void validate_wrap(const IAccessor &tensor, const SimpleTensor<T> &reference, co
                 // check for wrapping
                 if(!equal)
                 {
-                    if(!support::cpp11::isfinite(target_value) || !support::cpp11::isfinite(reference_value))
+                    if(are_equal_infs(target_value, reference_value))
                     {
-                        equal = false;
+                        equal = true;
                     }
                     else
                     {
@@ -687,210 +686,6 @@ void validate_min_max_loc(const MinMaxLocationValues<T> &target, const MinMaxLoc
         ARM_COMPUTE_EXPECT(same_coords != reference.max_loc.end(), framework::LogLevel::ERRORS);
     }
 }
-
-/** Check which keypoints from [first1, last1) are missing in [first2, last2) */
-template <typename T, typename U, typename V>
-std::pair<int64_t, int64_t> compare_keypoints(T first1, T last1, U first2, U last2, V tolerance, bool check_mismatches = true)
-{
-    /* Keypoint (x,y) should have similar strength (within tolerance) and other properties in both reference and target */
-    const auto compare_props_eq = [&](const KeyPoint & lhs, const KeyPoint & rhs)
-    {
-        return compare<V>(lhs.strength, rhs.strength, tolerance)
-               && lhs.tracking_status == rhs.tracking_status
-               && lhs.scale == rhs.scale
-               && lhs.orientation == rhs.orientation
-               && lhs.error == rhs.error;
-    };
-
-    /* Used to sort KeyPoints by coordinates (x, y) */
-    const auto compare_coords_lt = [](const KeyPoint & lhs, const KeyPoint & rhs)
-    {
-        return std::tie(lhs.x, lhs.y) < std::tie(rhs.x, rhs.y);
-    };
-
-    std::sort(first1, last1, compare_coords_lt);
-    std::sort(first2, last2, compare_coords_lt);
-
-    if(check_mismatches)
-    {
-        ARM_COMPUTE_TEST_INFO("Checking for mismatches: ref count = " << std::distance(first1, last1) << " target count = " << std::distance(first2, last2));
-    }
-
-    int64_t num_missing    = 0;
-    int64_t num_mismatches = 0;
-    bool    rest_missing   = false;
-
-    while(first1 != last1)
-    {
-        if(first2 == last2)
-        {
-            rest_missing = true;
-            break;
-        }
-
-        if(compare_coords_lt(*first1, *first2))
-        {
-            ++num_missing;
-            ARM_COMPUTE_TEST_INFO("Key point not found");
-            ARM_COMPUTE_TEST_INFO("keypoint1 = " << *first1++);
-            framework::ARM_COMPUTE_PRINT_INFO();
-        }
-        else
-        {
-            if(!compare_coords_lt(*first2, *first1)) // Equal coordinates
-            {
-                if(check_mismatches && !compare_props_eq(*first1, *first2)) // Check other properties
-                {
-                    ++num_mismatches;
-                    ARM_COMPUTE_TEST_INFO("Mismatching keypoint");
-                    ARM_COMPUTE_TEST_INFO("keypoint1 [ref] = " << *first1);
-                    ARM_COMPUTE_TEST_INFO("keypoint2 [tgt] = " << *first2);
-                    framework::ARM_COMPUTE_PRINT_INFO();
-                }
-                ++first1;
-            }
-            ++first2;
-        }
-    }
-
-    if(rest_missing)
-    {
-        while(first1 != last1)
-        {
-            ++num_missing;
-            ARM_COMPUTE_TEST_INFO("Key point not found");
-            ARM_COMPUTE_TEST_INFO("keypoint1 = " << *first1++);
-            framework::ARM_COMPUTE_PRINT_INFO();
-        }
-    }
-
-    return std::make_pair(num_missing, num_mismatches);
-}
-
-template <typename T, typename U, typename V>
-void validate_keypoints(T target_first, T target_last, U reference_first, U reference_last, V tolerance, float allowed_missing_percentage, float allowed_mismatch_percentage)
-{
-    if(framework::Framework::get().configure_only() && framework::Framework::get().new_fixture_call())
-    {
-        return;
-    }
-
-    const int64_t num_elements_target    = std::distance(target_first, target_last);
-    const int64_t num_elements_reference = std::distance(reference_first, reference_last);
-
-    int64_t num_missing    = 0;
-    int64_t num_mismatches = 0;
-
-    if(num_elements_reference > 0)
-    {
-        std::tie(num_missing, num_mismatches) = compare_keypoints(reference_first, reference_last, target_first, target_last, tolerance);
-
-        const float percent_missing    = static_cast<float>(num_missing) / num_elements_reference * 100.f;
-        const float percent_mismatches = static_cast<float>(num_mismatches) / num_elements_reference * 100.f;
-
-        ARM_COMPUTE_TEST_INFO(num_missing << " keypoints (" << std::fixed << std::setprecision(2) << percent_missing << "%) in ref are missing from target");
-        ARM_COMPUTE_TEST_INFO("Missing (not in tgt): " << num_missing << "/" << num_elements_reference << " = " << std::fixed << std::setprecision(2) << percent_missing
-                              << "% \tMax allowed: " << allowed_missing_percentage << "%");
-        ARM_COMPUTE_EXPECT(percent_missing <= allowed_missing_percentage, framework::LogLevel::ERRORS);
-
-        ARM_COMPUTE_TEST_INFO(num_mismatches << " keypoints (" << std::fixed << std::setprecision(2) << percent_mismatches << "%) mismatched");
-        ARM_COMPUTE_TEST_INFO("Mismatched keypoints: " << num_mismatches << "/" << num_elements_reference << " = " << std::fixed << std::setprecision(2) << percent_mismatches
-                              << "% \tMax allowed: " << allowed_mismatch_percentage << "%");
-        ARM_COMPUTE_EXPECT(percent_mismatches <= allowed_mismatch_percentage, framework::LogLevel::ERRORS);
-    }
-
-    if(num_elements_target > 0)
-    {
-        // Note: no need to check for mismatches a second time (last argument is 'false')
-        std::tie(num_missing, num_mismatches) = compare_keypoints(target_first, target_last, reference_first, reference_last, tolerance, false);
-
-        const float percent_missing = static_cast<float>(num_missing) / num_elements_target * 100.f;
-
-        ARM_COMPUTE_TEST_INFO(num_missing << " keypoints (" << std::fixed << std::setprecision(2) << percent_missing << "%) in target are missing from ref");
-        ARM_COMPUTE_TEST_INFO("Missing (not in ref): " << num_missing << "/" << num_elements_target << " = " << std::fixed << std::setprecision(2) << percent_missing
-                              << "% \tMax allowed: " << allowed_missing_percentage << "%");
-        ARM_COMPUTE_EXPECT(percent_missing <= allowed_missing_percentage, framework::LogLevel::ERRORS);
-    }
-}
-
-/** Check which detection windows from [first1, last1) are missing in [first2, last2) */
-template <typename T, typename U, typename V>
-std::pair<int64_t, int64_t> compare_detection_windows(T first1, T last1, U first2, U last2, V tolerance)
-{
-    int64_t num_missing    = 0;
-    int64_t num_mismatches = 0;
-
-    while(first1 != last1)
-    {
-        const auto window = std::find_if(first2, last2, [&](DetectionWindow window)
-        {
-            return window.x == first1->x && window.y == first1->y && window.width == first1->width && window.height == first1->height && window.idx_class == first1->idx_class;
-        });
-
-        if(window == last2)
-        {
-            ++num_missing;
-            ARM_COMPUTE_TEST_INFO("Detection window not found " << *first1)
-            framework::ARM_COMPUTE_PRINT_INFO();
-        }
-        else
-        {
-            if(!compare<V>(window->score, first1->score, tolerance))
-            {
-                ++num_mismatches;
-                ARM_COMPUTE_TEST_INFO("Mismatching detection window")
-                ARM_COMPUTE_TEST_INFO("detection window 1= " << *first1)
-                ARM_COMPUTE_TEST_INFO("detection window 2= " << *window)
-                framework::ARM_COMPUTE_PRINT_INFO();
-            }
-        }
-
-        ++first1;
-    }
-
-    return std::make_pair(num_missing, num_mismatches);
-}
-
-template <typename T, typename U, typename V>
-void validate_detection_windows(T target_first, T target_last, U reference_first, U reference_last, V tolerance,
-                                float allowed_missing_percentage, float allowed_mismatch_percentage)
-{
-    if(framework::Framework::get().configure_only() && framework::Framework::get().new_fixture_call())
-    {
-        return;
-    }
-
-    const int64_t num_elements_target    = std::distance(target_first, target_last);
-    const int64_t num_elements_reference = std::distance(reference_first, reference_last);
-
-    int64_t num_missing    = 0;
-    int64_t num_mismatches = 0;
-
-    if(num_elements_reference > 0)
-    {
-        std::tie(num_missing, num_mismatches) = compare_detection_windows(reference_first, reference_last, target_first, target_last, tolerance);
-
-        const float percent_missing    = static_cast<float>(num_missing) / num_elements_reference * 100.f;
-        const float percent_mismatches = static_cast<float>(num_mismatches) / num_elements_reference * 100.f;
-
-        ARM_COMPUTE_TEST_INFO(num_missing << " detection windows (" << std::fixed << std::setprecision(2) << percent_missing << "%) are missing in target");
-        ARM_COMPUTE_EXPECT(percent_missing <= allowed_missing_percentage, framework::LogLevel::ERRORS);
-
-        ARM_COMPUTE_TEST_INFO(num_mismatches << " detection windows (" << std::fixed << std::setprecision(2) << percent_mismatches << "%) mismatched");
-        ARM_COMPUTE_EXPECT(percent_mismatches <= allowed_mismatch_percentage, framework::LogLevel::ERRORS);
-    }
-
-    if(num_elements_target > 0)
-    {
-        std::tie(num_missing, num_mismatches) = compare_detection_windows(target_first, target_last, reference_first, reference_last, tolerance);
-
-        const float percent_missing = static_cast<float>(num_missing) / num_elements_target * 100.f;
-
-        ARM_COMPUTE_TEST_INFO(num_missing << " detection windows (" << std::fixed << std::setprecision(2) << percent_missing << "%) are not part of target");
-        ARM_COMPUTE_EXPECT(percent_missing <= allowed_missing_percentage, framework::LogLevel::ERRORS);
-    }
-}
-
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/cpu/unit/Context.cpp b/tests/validation/cpu/unit/Context.cpp
index bf2a02df5d..42247ba1da 100644
--- a/tests/validation/cpu/unit/Context.cpp
+++ b/tests/validation/cpu/unit/Context.cpp
@@ -21,11 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-
-#include "arm_compute/Acl.hpp"
+#include "tests/validation/fixtures/UNIT/ContextFixture.h"
 
 #include "src/cpu/CpuContext.h"
 
@@ -78,92 +74,10 @@ TEST_CASE(CreateContextWithInvalidOptions, framework::DatasetMode::ALL)
     ARM_COMPUTE_ASSERT(ctx == nullptr);
 }
 
-/** Test-case for AclDestroyContext
- *
- * Validate that AclDestroyContext behaves as expected when invalid inputs as context are given
- *
- * Test Steps:
- *  - Call AclDestroyContext with null context
- *  - Confirm that AclInvalidArgument is reported
- *  - Call AclDestroyContext on empty array
- *  - Confirm that AclInvalidArgument is reported
- *  - Call AclDestroyContext on an ACL object other than AclContext
- *  - Confirm that AclInvalidArgument is reported
- *  - Confirm that context is still nullptr
- */
-TEST_CASE(DestroyInvalidContext, framework::DatasetMode::ALL)
-{
-    AclContext ctx = nullptr;
-    std::array<char, 256> empty_array{};
-    AclContext valid_ctx = nullptr;
-    ARM_COMPUTE_ASSERT(AclCreateContext(&valid_ctx, AclCpu, nullptr) == AclStatus::AclSuccess);
-    ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclInvalidArgument);
-    ARM_COMPUTE_ASSERT(AclDestroyContext(reinterpret_cast<AclContext>(empty_array.data())) == AclStatus::AclInvalidArgument);
-    ARM_COMPUTE_ASSERT(ctx == nullptr);
-    ARM_COMPUTE_ASSERT(AclDestroyContext(valid_ctx) == AclStatus::AclSuccess);
-}
-
-/** Test-case for AclCreateContext and AclDestroy Context
- *
- * Validate that AclCreateContext can create and destroy a context
- *
- * Test Steps:
- *  - Call AclCreateContext with valid target
- *  - Confirm that context is not nullptr and error code is AclSuccess
- *  - Destroy context
- *  - Confirm that AclSuccess is reported
- */
-TEST_CASE(SimpleContextCApi, framework::DatasetMode::ALL)
-{
-    AclContext ctx = nullptr;
-    ARM_COMPUTE_ASSERT(AclCreateContext(&ctx, AclCpu, nullptr) == AclStatus::AclSuccess);
-    ARM_COMPUTE_ASSERT(ctx != nullptr);
-    ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclSuccess);
-}
-
-/** Test-case for Context from the C++ interface
- *
- * Test Steps:
- *  - Create a Context obejct
- *  - Confirm that StatusCode::Success is reported
- *  - Confirm that equality operator works
- *  - Confirm that inequality operator works
- */
-TEST_CASE(SimpleContextCppApi, framework::DatasetMode::ALL)
-{
-    acl::StatusCode status = acl::StatusCode::Success;
-    acl::Context    ctx(acl::Target::Cpu, &status);
-    ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success);
-
-    auto ctx_eq = ctx;
-    ARM_COMPUTE_ASSERT(ctx_eq == ctx);
-
-    acl::Context ctx_ienq(acl::Target::Cpu, &status);
-    ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success);
-    ARM_COMPUTE_ASSERT(ctx_ienq != ctx);
-}
-
-/** Test-case for CpuCapabilities
- *
- * Validate that AclCreateContext can create/destroy multiple contexts with different options
- *
- * Test Steps:
- *  - Call AclCreateContext with different targets
- *  - Confirm that AclSuccess is reported
- *  - Destroy all contexts
- *  - Confirm that AclSuccess is reported
- */
-TEST_CASE(MultipleContexts, framework::DatasetMode::ALL)
-{
-    const unsigned int num_tests = 5;
-    std::array<AclContext, num_tests> ctxs{};
-    for(unsigned int i = 0; i < num_tests; ++i)
-    {
-        ARM_COMPUTE_ASSERT(AclCreateContext(&ctxs[i], AclTarget::AclCpu, nullptr) == AclStatus::AclSuccess);
-        ARM_COMPUTE_ASSERT(ctxs[i] != nullptr);
-        ARM_COMPUTE_ASSERT(AclDestroyContext(ctxs[i]) == AclStatus::AclSuccess);
-    }
-}
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidContext, DestroyInvalidContextFixture<AclTarget::AclCpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCApi, SimpleContextCApiFixture<AclTarget::AclCpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCppApi, SimpleContextCppApiFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MultipleContexts, MultipleContextsFixture<AclTarget::AclCpu>, framework::DatasetMode::ALL)
 
 /** Test-case for CpuCapabilities
  *
@@ -176,17 +90,17 @@ TEST_CASE(MultipleContexts, framework::DatasetMode::ALL)
  */
 TEST_CASE(CpuCapabilities, framework::DatasetMode::ALL)
 {
-    AclContextOptions opts = acl_default_ctx_options;
-    opts.capabilities      = AclCpuCapabilitiesDot | AclCpuCapabilitiesMmlaInt8 | AclCpuCapabilitiesSve2;
-    arm_compute::cpu::CpuContext ctx(&opts);
+    acl::Context::Options opts;
+    opts.copts.capabilities = AclCpuCapabilitiesDot | AclCpuCapabilitiesMmlaInt8 | AclCpuCapabilitiesSve2;
+    arm_compute::cpu::CpuContext ctx(&opts.copts);
 
-    ARM_COMPUTE_ASSERT(ctx.capabilities().dot == true);
-    ARM_COMPUTE_ASSERT(ctx.capabilities().mmla_int8 == true);
-    ARM_COMPUTE_ASSERT(ctx.capabilities().sve2 == true);
-    ARM_COMPUTE_ASSERT(ctx.capabilities().fp16 == false);
+    ARM_COMPUTE_ASSERT(ctx.capabilities().cpu_info.has_dotprod() == true);
+    ARM_COMPUTE_ASSERT(ctx.capabilities().cpu_info.has_i8mm() == true);
+    ARM_COMPUTE_ASSERT(ctx.capabilities().cpu_info.has_sve2() == true);
+    ARM_COMPUTE_ASSERT(ctx.capabilities().cpu_info.has_fp16() == false);
 
     arm_compute::cpu::CpuContext ctx_legacy(nullptr);
-    ARM_COMPUTE_ASSERT(ctx_legacy.capabilities().neon == true);
+    ARM_COMPUTE_ASSERT(ctx_legacy.capabilities().cpu_info.has_neon() == true);
 }
 
 TEST_SUITE_END() // Context
diff --git a/tests/validation/reference/HOGDetector.h b/tests/validation/cpu/unit/Queue.cpp
index 9809ae384f..7d977cc48e 100644
--- a/tests/validation/reference/HOGDetector.h
+++ b/tests/validation/cpu/unit/Queue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_HOG_DETECTOR_H
-#define ARM_COMPUTE_TEST_HOG_DETECTOR_H
+#include "tests/validation/fixtures/UNIT/QueueFixture.h"
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "tests/SimpleTensor.h"
-
-#include <vector>
+#include "src/cpu/CpuQueue.h"
 
 namespace arm_compute
 {
@@ -36,13 +31,18 @@ namespace test
 {
 namespace validation
 {
-namespace reference
-{
-template <typename T>
-std::vector<DetectionWindow> hog_detector(const SimpleTensor<T> &src, const std::vector<T> &descriptor, unsigned int max_num_detection_windows,
-                                          const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
-} // namespace reference
+TEST_SUITE(CPU)
+TEST_SUITE(UNIT)
+TEST_SUITE(Queue)
+
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueueWithInvalidContext, CreateQueueWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueuerWithInvalidOptions, CreateQueuerWithInvalidOptionsFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidQueue, DestroyInvalidQueueFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleQueue, SimpleQueueFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+
+TEST_SUITE_END() // Queue
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CPU
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOG_DETECTOR_H */
diff --git a/tests/validation/cpu/unit/Tensor.cpp b/tests/validation/cpu/unit/Tensor.cpp
new file mode 100644
index 0000000000..cc0c55758f
--- /dev/null
+++ b/tests/validation/cpu/unit/Tensor.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "tests/validation/fixtures/UNIT/TensorFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CPU)
+TEST_SUITE(UNIT)
+TEST_SUITE(Tensor)
+
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidContext, CreateTensorWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidDescriptor, CreateTensorWithInvalidDescriptorFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensor, DestroyInvalidTensorFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensor, SimpleTensorFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(TensorStress, TensorStressFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapInvalidTensor, MapInvalidTensorFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapNotAllocatedTensor, MapNotAllocatedTensorFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapAllocatedTensor, MapAllocatedTensorFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(ImportMemory, ImportMemoryFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetSize, TensorSizeFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidSize, InvalidTensorSizeFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetDescriptor, DescriptorConversionFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidDescriptor, InvalidDescriptorConversionFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+
+TEST_SUITE_END() // Tensor
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CPU
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HOGMultiDetection.h b/tests/validation/cpu/unit/TensorPack.cpp
index 7194af70c7..f019e8e3c4 100644
--- a/tests/validation/reference/HOGMultiDetection.h
+++ b/tests/validation/cpu/unit/TensorPack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H
-#define ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H
-
-#include "arm_compute/core/Types.h"
-#include "tests/SimpleTensor.h"
-
-#include <vector>
+#include "tests/validation/fixtures/UNIT/TensorPackFixture.h"
 
 namespace arm_compute
 {
@@ -35,14 +29,19 @@ namespace test
 {
 namespace validation
 {
-namespace reference
-{
-template <typename T>
-std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value,
-                                                 const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
-                                                 unsigned int max_num_detection_windows, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
-} // namespace reference
+TEST_SUITE(CPU)
+TEST_SUITE(UNIT)
+TEST_SUITE(TensorPack)
+
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorPackWithInvalidContext, CreateTensorPackWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensorPack, DestroyInvalidTensorPackFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(AddInvalidObjectToTensorPack, AddInvalidObjectToTensorPackFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensorPack, SimpleTensorPackFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MultipleTensorsInPack, MultipleTensorsInPackFixture<acl::Target::Cpu>, framework::DatasetMode::ALL)
+
+TEST_SUITE_END() // TensorPack
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CPU
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H */
diff --git a/tests/validation/dynamic_fusion/Utils.h b/tests/validation/dynamic_fusion/Utils.h
new file mode 100644
index 0000000000..72e9ec5955
--- /dev/null
+++ b/tests/validation/dynamic_fusion/Utils.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef TESTS_VALIDATION_DYNAMIC_FUSION_UTILS
+#define TESTS_VALIDATION_DYNAMIC_FUSION_UTILS
+
+#include "tests/AssetsLibrary.h"
+#include "utils/Utils.h"
+
+#include <chrono>
+#include <limits>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace utils
+{
+/** Wall-clock timing macros: TICK(clock_name) records a start time point; TOCK / TOCK_AVG record an end time point and store the elapsed microseconds (TOCK_AVG: elapsed time divided by num_iterations) in measurement_map under the quoted key "clock_name".
+ *  NOTE(review): duration_cast and microseconds are unqualified here, so the expansion site appears to need the std::chrono names in scope (e.g. through a using-directive) - confirm against callers.
+ */
+#define TICK(clock_name) \
+    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map)                                               \
+    auto clock_name##_tock                 = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations)                           \
+    auto clock_name##_tock                 = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+template <typename T, typename U>
+void fill(U &&tensor, int seed, AssetsLibrary *library)
+{
+    static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+    using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+    // half selects the 16-bit distribution, every other T the std one; the tensor is filled from a distribution bounded by -1 and 1 using the given seed
+    DistributionType distribution{ T(-1.0f), T(1.0f) };
+    library->fill(tensor, distribution, seed);
+
+    // Fill border with infinity (both distribution bounds are +inf) in order to check the presence of NaN values (i.e. inf * 0)
+    DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+    library->fill_borders_with_garbage(tensor, distribution_inf, seed);
+}
+} // namespace utils
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* TESTS_VALIDATION_DYNAMIC_FUSION_UTILS */
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
new file mode 100644
index 0000000000..453983c077
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/dynamic_fusion/Utils.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
+#include "tests/validation/reference/DepthwiseConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+
+TEST_CASE(Conv2d, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = conv2d1x1(direct_conv)(input, weights); no bias tensor is passed to the operator (nullptr)
+     */
+    CLScheduler::get().default_reinit();
+
+    const auto data_type      = DataType::F32;
+    const auto data_layout    = DataLayout::NHWC;
+    const auto t_input_shape  = TensorShape(384, 12, 12);
+    const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+    const auto t_dst_shape    = TensorShape(16, 12, 12);
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    // Fuse conv2d
+    Conv2dAttributes conv2d_attr{};
+    ITensorInfo     *input_info  = context.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+    ITensorInfo     *weight_info = context.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+
+    ITensorInfo *conv_out_info = GpuConv2d::create_op(sketch, input_info, weight_info, nullptr, conv2d_attr);
+
+    ITensorInfo *dst_info = context.create_tensor_info();
+    GpuOutput::create_op(sketch, conv_out_info, dst_info);
+
+    // Configure runtime
+    ClWorkloadRuntime runtime;
+    runtime.configure(sketch);
+
+    // (Important) Allocate memory for the auxiliary tensors reported by the runtime, if there are any
+    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+    for (auto &data : runtime.get_auxiliary_tensors())
+    {
+        CLTensor            *tensor      = std::get<0>(data);
+        const TensorInfo    &info        = std::get<1>(data); // bound by reference: no per-tensor copy needed
+        const AuxMemoryInfo &aux_mem_req = std::get<2>(data);
+        tensor->allocator()->init(info, aux_mem_req.alignment);
+        tensor->allocator()->allocate(); // Use ACL allocated memory
+        // auto buf = cl::Buffer();
+        // tensor->allocator()->import_memory(buf);  // Or, import external memory
+    }
+
+    // Construct user tensors
+    CLTensor t_input{};
+    CLTensor t_weight{};
+    CLTensor t_dst{};
+
+    // Initialize user tensors
+    t_input.allocator()->init(*input_info);
+    t_weight.allocator()->init(*weight_info);
+    t_dst.allocator()->init(*dst_info);
+
+    // Allocate and fill user tensors
+    // Instead of using ACL allocator, the user can choose to import memory into the tensors
+    t_input.allocator()->allocate();
+    t_weight.allocator()->allocate();
+    t_dst.allocator()->allocate();
+    fill<float>(CLAccessor(t_input), 0, library.get());
+    fill<float>(CLAccessor(t_weight), 1, library.get());
+
+    // Run runtime
+    runtime.run({&t_input, &t_weight, &t_dst});
+
+    // Create reference
+    SimpleTensor<float> ref_t_input{t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC};
+    SimpleTensor<float> ref_t_weight{t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC};
+    SimpleTensor<float> ref_t_bias_placeholder{t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC};
+
+    // Fill reference with the same seeds as the CL tensors above (the bias placeholder is not filled)
+    fill<float>(ref_t_input, 0, library.get());
+    fill<float>(ref_t_weight, 1, library.get());
+
+    auto ref_t_input_nchw            = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+    auto ref_t_weight_nchw           = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+    auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+    auto t_dst_shape_nchw            = t_dst_shape;
+    permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+    PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left,
+                                    conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+                                    DimensionRoundingType{});
+    auto ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw,
+                                                       t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
+    // NOTE(review): validate() is given the NCHW reference; presumably it handles the NHWC target layout itself - confirm
+
+    // Tolerance value for comparing reference's output against implementation's output for floating point data types
+    RelativeTolerance<float> tolerance_f32(0.001f);
+    validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+
+TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
+{
+    /* Computation (both out_0 and out_1 are exposed through a GpuOutput operator):
+     *   out_0 = in_0 + in_1
+     *   out_1 = out_0 + in_2
+     */
+    CLScheduler::get().default_reinit();
+
+    const auto data_type     = DataType::F32;
+    const auto t_input_shape = TensorShape(33, 3, 2);
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    ITensorInfo *in_0_info = context.create_tensor_info(t_input_shape, 1, data_type);
+    ITensorInfo *in_1_info = context.create_tensor_info(t_input_shape, 1, data_type);
+    ITensorInfo *in_2_info = context.create_tensor_info(t_input_shape, 1, data_type);
+
+    ITensorInfo *out_0_info = context.create_tensor_info();
+    ITensorInfo *out_1_info = context.create_tensor_info();
+    // ans_0 is used twice: as the first workload output and as the lhs of the second add
+    ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, in_0_info, in_1_info);
+    GpuOutput::create_op(sketch, ans_0_info, out_0_info);
+    ITensorInfo *ans_1_info = GpuAdd::create_op(sketch, ans_0_info, in_2_info);
+    GpuOutput::create_op(sketch, ans_1_info, out_1_info);
+
+    // Configure runtime
+    ClWorkloadRuntime runtime;
+    runtime.configure(sketch);
+
+    // (Important) Allocate memory for the auxiliary tensors reported by the runtime, if there are any
+    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+    for (auto &data : runtime.get_auxiliary_tensors())
+    {
+        CLTensor     *tensor      = std::get<0>(data);
+        TensorInfo    info        = std::get<1>(data);
+        AuxMemoryInfo aux_mem_req = std::get<2>(data);
+        tensor->allocator()->init(info, aux_mem_req.alignment);
+        tensor->allocator()->allocate(); // Use ACL allocated memory
+        // auto buf = cl::Buffer();
+        // tensor->allocator()->import_memory(buf);  // Or, import external memory
+    }
+
+    // Construct user tensors
+    CLTensor t_in_0{};
+    CLTensor t_in_1{};
+    CLTensor t_in_2{};
+
+    CLTensor t_out_0{};
+    CLTensor t_out_1{};
+
+    // Initialize user tensors from the tensor infos held by the workload context
+    t_in_0.allocator()->init(*in_0_info);
+    t_in_1.allocator()->init(*in_1_info);
+    t_in_2.allocator()->init(*in_2_info);
+
+    t_out_0.allocator()->init(*out_0_info);
+    t_out_1.allocator()->init(*out_1_info);
+
+    // Allocate and fill user tensors
+    // Instead of using ACL allocator, the user can choose to import memory into the tensors
+    t_in_0.allocator()->allocate();
+    t_in_1.allocator()->allocate();
+    t_in_2.allocator()->allocate();
+
+    t_out_0.allocator()->allocate();
+    t_out_1.allocator()->allocate();
+
+    fill<float>(CLAccessor(t_in_0), 0, library.get());
+    fill<float>(CLAccessor(t_in_1), 1, library.get());
+    fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+    // Run runtime, passing the three inputs followed by the two outputs
+    runtime.run({&t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1});
+
+    // Create reference
+    SimpleTensor<float> ref_t_in_0{t_input_shape, data_type, 1, QuantizationInfo()};
+    SimpleTensor<float> ref_t_in_1{t_input_shape, data_type, 1, QuantizationInfo()};
+    SimpleTensor<float> ref_t_in_2{t_input_shape, data_type, 1, QuantizationInfo()};
+
+    SimpleTensor<float> ref_t_out_0{t_input_shape, data_type, 1, QuantizationInfo()};
+    SimpleTensor<float> ref_t_out_1{t_input_shape, data_type, 1, QuantizationInfo()};
+
+    // Fill reference inputs with the same seeds (0, 1, 2) used for the CL inputs above
+    fill<float>(ref_t_in_0, 0, library.get());
+    fill<float>(ref_t_in_1, 1, library.get());
+    fill<float>(ref_t_in_2, 2, library.get());
+
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1,
+                                    ConvertPolicy::WRAP);
+
+    RelativeTolerance<float> tolerance_f32(
+        0.001f); /**< Relative tolerance used when comparing the reference output against the implementation output (F32) */
+    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
+    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
+}
+TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
+{
+    /* Computation, as built in the sketch below (cast_0_attr is F16, cast_1_attr is F32):
+     *   out_0 = in_0 + in_1
+     *   out_1 = float(half(out_0 + in_2))
+     */
+    CLScheduler::get().default_reinit();
+
+    const auto data_type     = DataType::F32;
+    const auto t_input_shape = TensorShape(3, 8, 5);
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    ITensorInfo *in_0_info = context.create_tensor_info(t_input_shape, 1, data_type);
+    ITensorInfo *in_1_info = context.create_tensor_info(t_input_shape, 1, data_type);
+    ITensorInfo *in_2_info = context.create_tensor_info(t_input_shape, 1, data_type);
+
+    ITensorInfo *out_0_info = context.create_tensor_info();
+    ITensorInfo *out_1_info = context.create_tensor_info();
+    // First cast goes to F16, second cast goes back to F32
+    CastAttributes cast_0_attr;
+    cast_0_attr.data_type(DataType::F16);
+
+    CastAttributes cast_1_attr;
+    cast_1_attr.data_type(DataType::F32);
+
+    ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, in_0_info, in_1_info);
+    GpuOutput::create_op(sketch, ans_0_info, out_0_info);
+    ITensorInfo *ans_1_info = GpuAdd::create_op(sketch, ans_0_info, in_2_info);
+    ITensorInfo *ans_2_info = GpuCast::create_op(sketch, ans_1_info, cast_0_attr);
+    ITensorInfo *ans_3_info = GpuCast::create_op(sketch, ans_2_info, cast_1_attr);
+    GpuOutput::create_op(sketch, ans_3_info, out_1_info);
+
+    // Configure runtime
+    ClWorkloadRuntime runtime;
+    runtime.configure(sketch);
+
+    // (Important) Allocate memory for the auxiliary tensors reported by the runtime, if there are any
+    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+    for (auto &data : runtime.get_auxiliary_tensors())
+    {
+        CLTensor     *tensor      = std::get<0>(data);
+        TensorInfo    info        = std::get<1>(data);
+        AuxMemoryInfo aux_mem_req = std::get<2>(data);
+        tensor->allocator()->init(info, aux_mem_req.alignment);
+        tensor->allocator()->allocate(); // Use ACL allocated memory
+        // auto buf = cl::Buffer();
+        // tensor->allocator()->import_memory(buf);  // Or, import external memory
+    }
+
+    // Construct user tensors
+    CLTensor t_in_0{};
+    CLTensor t_in_1{};
+    CLTensor t_in_2{};
+
+    CLTensor t_out_0{};
+    CLTensor t_out_1{};
+
+    // Initialize user tensors from the tensor infos held by the workload context
+    t_in_0.allocator()->init(*in_0_info);
+    t_in_1.allocator()->init(*in_1_info);
+    t_in_2.allocator()->init(*in_2_info);
+
+    t_out_0.allocator()->init(*out_0_info);
+    t_out_1.allocator()->init(*out_1_info);
+
+    // Allocate and fill user tensors
+    // Instead of using ACL allocator, the user can choose to import memory into the tensors
+    t_in_0.allocator()->allocate();
+    t_in_1.allocator()->allocate();
+    t_in_2.allocator()->allocate();
+
+    t_out_0.allocator()->allocate();
+    t_out_1.allocator()->allocate();
+
+    fill<float>(CLAccessor(t_in_0), 0, library.get());
+    fill<float>(CLAccessor(t_in_1), 1, library.get());
+    fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+    // Run runtime, passing the three inputs followed by the two outputs
+    runtime.run({&t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1});
+
+    // Create reference. NOTE(review): the reference below round-trips through S32 while the sketch casts through F16 - confirm intent
+    SimpleTensor<float> ref_t_in_0{t_input_shape, data_type, 1, QuantizationInfo()};
+    SimpleTensor<float> ref_t_in_1{t_input_shape, data_type, 1, QuantizationInfo()};
+    SimpleTensor<float> ref_t_in_2{t_input_shape, data_type, 1, QuantizationInfo()};
+
+    SimpleTensor<float> ref_t_out_0{t_input_shape, data_type, 1, QuantizationInfo()};
+    SimpleTensor<float> ref_t_ans_1{t_input_shape, data_type, 1, QuantizationInfo()};
+
+    // Fill reference inputs with the same seeds (0, 1, 2) used for the CL inputs above
+    fill<float>(ref_t_in_0, 0, library.get());
+    fill<float>(ref_t_in_1, 1, library.get());
+    fill<float>(ref_t_in_2, 2, library.get());
+
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_ans_1,
+                                    ConvertPolicy::WRAP);
+    const auto ref_t_ans_2 =
+        reference::depth_convert<float, int32_t>(ref_t_ans_1, DataType::S32, ConvertPolicy::SATURATE, 0);
+    const auto ref_t_out_1 =
+        reference::depth_convert<int32_t, float>(ref_t_ans_2, DataType::F32, ConvertPolicy::SATURATE, 0);
+    // out_0 is checked with a relative tolerance; out_1 (after the casts) with an absolute tolerance of 1.0
+    RelativeTolerance<float> tolerance_add_f32(0.001f);
+    AbsoluteTolerance<float> tolerance_cast_f32(1.0f);
+    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_add_f32);
+    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
+}
+
+/// TODO: COMPMID-6593 : This integration test fails with CKW backend.
+/// It was not enabled for CKW before, therefore went unnoticed.
+TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::DISABLED)
+{
+    //   (tensor0)
+    //       |
+    // ======|============================================== Sketch 0
+    //       |     (tensor1)     +---- (tensor2)
+    //       |         |         |         |
+    // +-- input -- weights -- biases --+  |
+    // |                                |  |
+    // |            Conv2d              |  |
+    // |                                |  |
+    // +----------- output -------------+  |
+    //                |                    |
+    //          +-- input ---+             |
+    //          |            |             |
+    //          |  Sigmoid   |             |
+    //          |            |             |
+    //          +-- output --+             |
+    //                |                    |
+    //          +-- input ---+             |
+    //          |            |             |
+    //          |   Output   |             |
+    //          |            |             |
+    //          +-- output --+             |
+    //                |                    |
+    //            (tensor5)                |
+    //                |                    |
+    //       +--------+                    |
+    // ======|=============================|================ Sketch 1
+    //       |     (tensor3) (tensor4)     |
+    //       |         |         |         |
+    // +-- input -- weights -- biases --+  |
+    // |                                |  |
+    // |        DepthwiseConv2d         |  |
+    // |                                |  |
+    // +----------- output -------------+  |
+    //                |                    |
+    //             +--+   +----------------+
+    //             |      |
+    //        +-- lhs -- rhs --+
+    //        |                |
+    //        |    Multiply    |
+    //        |                |
+    //        +---- output ----+
+    //                |
+    //          +-- input ---+
+    //          |            |
+    //          |   Output   |
+    //          |            |
+    //          +-- output --+
+    //                |
+    //            (tensor6)
+
+    TensorShape conv2d_src_shape(10, 20, 30);
+    TensorShape conv2d_wei_shape(10, 3, 3, 5);
+    TensorShape conv2d_bia_shape(5);
+    TensorShape conv2d_dst_shape(5, 18, 28);
+    TensorShape dwc_wei_shape(5, 3, 3);
+    TensorShape dwc_bia_shape(5);
+    TensorShape dwc_dst_shape(5, 16, 26);
+
+    // Initialize the context.
+    CLScheduler::get().default_reinit();
+
+    auto               cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context(&cl_compile_ctx);
+
+    auto tensor0_info = context.create_tensor_info(conv2d_src_shape, 1, DataType::F32, DataLayout::NHWC);
+
+    // Create the first sketch: conv2d + sigmoid + output.
+    GpuWorkloadSketch sketch0(&context);
+
+    Conv2dAttributes conv2d_attr;
+    auto             tensor1_info = context.create_tensor_info(conv2d_wei_shape, 1, DataType::F32, DataLayout::NHWC);
+    auto             tensor2_info = context.create_tensor_info(conv2d_bia_shape, 1, DataType::F32, DataLayout::NHWC);
+    ARM_COMPUTE_EXPECT(GpuConv2d::validate_op(sketch0, tensor0_info, tensor1_info, tensor2_info, conv2d_attr),
+                       framework::LogLevel::ERRORS);
+    auto ans_info = GpuConv2d::create_op(sketch0, tensor0_info, tensor1_info, tensor2_info, conv2d_attr);
+
+    ARM_COMPUTE_EXPECT(GpuSigmoid::validate_op(sketch0, ans_info), framework::LogLevel::ERRORS);
+    ans_info = GpuSigmoid::create_op(sketch0, ans_info);
+    // DepthwiseConv2d is expected to be rejected by sketch0 (negated expectation below); it is created in sketch1 instead.
+    DepthwiseConv2dAttributes dwc_attr;
+    auto tensor3_info = context.create_tensor_info(dwc_wei_shape, 1, DataType::F32, DataLayout::NHWC);
+    auto tensor4_info = context.create_tensor_info(dwc_bia_shape, 1, DataType::F32, DataLayout::NHWC);
+    ARM_COMPUTE_EXPECT(!GpuDepthwiseConv2d::validate_op(sketch0, ans_info, tensor3_info, tensor4_info, dwc_attr),
+                       framework::LogLevel::ERRORS);
+
+    auto tensor5_info = context.create_tensor_info();
+    ARM_COMPUTE_EXPECT(GpuOutput::validate_op(sketch0, ans_info, tensor5_info), framework::LogLevel::ERRORS);
+    GpuOutput::create_op(sketch0, ans_info, tensor5_info);
+
+    // Create the first workload runtime.
+    ClWorkloadRuntime runtime0;
+    runtime0.configure(sketch0);
+
+    // Create the second sketch: dwc + mul + output.
+    GpuWorkloadSketch sketch1(&context);
+
+    ARM_COMPUTE_EXPECT(GpuDepthwiseConv2d::validate_op(sketch1, tensor5_info, tensor3_info, tensor4_info, dwc_attr),
+                       framework::LogLevel::ERRORS);
+    ans_info = GpuDepthwiseConv2d::create_op(sketch1, tensor5_info, tensor3_info, tensor4_info, dwc_attr);
+
+    ARM_COMPUTE_EXPECT(GpuMul::validate_op(sketch1, ans_info, tensor2_info), framework::LogLevel::ERRORS);
+    ans_info = GpuMul::create_op(sketch1, ans_info, tensor2_info);
+
+    auto tensor6_info = context.create_tensor_info();
+    ARM_COMPUTE_EXPECT(GpuOutput::validate_op(sketch1, ans_info, tensor6_info), framework::LogLevel::ERRORS);
+    GpuOutput::create_op(sketch1, ans_info, tensor6_info);
+
+    // Create the second workload runtime.
+    ClWorkloadRuntime runtime1;
+    runtime1.configure(sketch1);
+
+    // Create the user tensors.
+    CLTensor tensor0;
+    CLTensor tensor1;
+    CLTensor tensor2;
+    CLTensor tensor3;
+    CLTensor tensor4;
+    CLTensor tensor5;
+    CLTensor tensor6;
+
+    tensor0.allocator()->init(*tensor0_info);
+    tensor1.allocator()->init(*tensor1_info);
+    tensor2.allocator()->init(*tensor2_info);
+    tensor3.allocator()->init(*tensor3_info);
+    tensor4.allocator()->init(*tensor4_info);
+    tensor5.allocator()->init(*tensor5_info);
+    tensor6.allocator()->init(*tensor6_info);
+
+    tensor0.allocator()->allocate();
+    tensor1.allocator()->allocate();
+    tensor2.allocator()->allocate();
+    tensor3.allocator()->allocate();
+    tensor4.allocator()->allocate();
+    tensor5.allocator()->allocate();
+    tensor6.allocator()->allocate();
+
+    // Allocate the auxiliary tensors of both runtimes.
+    for (auto &data : runtime0.get_auxiliary_tensors())
+    {
+        auto  tensor      = std::get<0>(data);
+        auto &tensor_info = std::get<1>(data);
+        auto  mem_req     = std::get<2>(data);
+
+        tensor->allocator()->init(tensor_info, mem_req.alignment);
+        tensor->allocator()->allocate();
+    }
+
+    for (auto &data : runtime1.get_auxiliary_tensors())
+    {
+        auto  tensor      = std::get<0>(data);
+        auto &tensor_info = std::get<1>(data);
+        auto  mem_req     = std::get<2>(data);
+
+        tensor->allocator()->init(tensor_info, mem_req.alignment);
+        tensor->allocator()->allocate();
+    }
+
+    // Fill the input tensors with random data.
+    fill<float>(CLAccessor(tensor0), 0, library.get());
+    fill<float>(CLAccessor(tensor1), 1, library.get());
+    fill<float>(CLAccessor(tensor2), 2, library.get());
+    fill<float>(CLAccessor(tensor3), 3, library.get());
+    fill<float>(CLAccessor(tensor4), 4, library.get());
+
+    // Run each runtime; tensor5, written by runtime0, is the first tensor passed to runtime1.
+    runtime0.run({&tensor0, &tensor1, &tensor2, &tensor5});
+    runtime1.run({&tensor5, &tensor3, &tensor4, &tensor2, &tensor6});
+
+    // Compute the reference result.
+    SimpleTensor<float> ref_conv2d_src(conv2d_src_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+    SimpleTensor<float> ref_conv2d_wei(conv2d_wei_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+    SimpleTensor<float> ref_conv2d_bia(conv2d_bia_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+    SimpleTensor<float> ref_dwc_wei(dwc_wei_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+    SimpleTensor<float> ref_dwc_bia(dwc_bia_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+    // Same seeds (0..4) as used for tensor0..tensor4 above.
+    fill<float>(ref_conv2d_src, 0, library.get());
+    fill<float>(ref_conv2d_wei, 1, library.get());
+    fill<float>(ref_conv2d_bia, 2, library.get());
+    fill<float>(ref_dwc_wei, 3, library.get());
+    fill<float>(ref_dwc_bia, 4, library.get());
+
+    PermutationVector nhwc_to_nchw(1, 2, 0);
+
+    auto conv2d_dst_shape_nchw = conv2d_dst_shape;
+    permute(conv2d_dst_shape_nchw, nhwc_to_nchw);
+    const auto ref_conv2d_src_nchw = reference::permute(ref_conv2d_src, nhwc_to_nchw);
+    const auto ref_conv2d_wei_nchw = reference::permute(ref_conv2d_wei, nhwc_to_nchw);
+    const auto ref_conv2d_bia_nchw = reference::permute(ref_conv2d_bia, nhwc_to_nchw);
+    const auto ref_conv2d_dst_nchw = reference::convolution_layer(
+        ref_conv2d_src_nchw, ref_conv2d_wei_nchw, ref_conv2d_bia_nchw, conv2d_dst_shape_nchw, PadStrideInfo());
+
+    const auto ref_sigmoid_dst_nchw = reference::activation_layer(
+        ref_conv2d_dst_nchw, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+
+    auto dwc_dst_shape_nchw = dwc_dst_shape;
+    permute(dwc_dst_shape_nchw, nhwc_to_nchw);
+    const auto ref_dwc_wei_nchw = reference::permute(ref_dwc_wei, nhwc_to_nchw);
+    const auto ref_dwc_bia_nchw = reference::permute(ref_dwc_bia, nhwc_to_nchw);
+    const auto ref_dwc_dst_nchw = reference::depthwise_convolution(
+        ref_sigmoid_dst_nchw, ref_dwc_wei_nchw, ref_dwc_bia_nchw, dwc_dst_shape_nchw, PadStrideInfo(), 1);
+    // The multiply's rhs is the conv2d bias (tensor2), matching GpuMul::create_op(sketch1, ans_info, tensor2_info) above.
+    const auto ref_mul_dst_nchw = reference::pixel_wise_multiplication<float, float, float>(
+        ref_dwc_dst_nchw, ref_conv2d_bia_nchw, 1.0, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP,
+        DataType::F32);
+
+    constexpr RelativeTolerance<float> tolerance(0.001f);
+    validate(CLAccessor(tensor6), ref_mul_dst_nchw, tolerance);
+}
+
+TEST_SUITE(Invalid_Fusion_Should_Fail)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+     *
+     * The first conv2d is created in the sketch; validating a second conv2d on the same sketch is expected to fail.
+     */
+    CLScheduler::get().default_reinit();
+
+    const auto data_type      = DataType::F32;
+    const auto data_layout    = DataLayout::NHWC;
+    const auto t_input_shape  = TensorShape(384, 12, 12);
+    const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+    auto       t_weight_info  = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+
+    Conv2dAttributes conv2d_attr{};
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    // Create tensor infos
+    ITensorInfo *input_info  = context.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+    ITensorInfo *weight_info = context.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+    ITensorInfo *dst_info    = nullptr;
+
+    // Fuse the first conv2d into the workload
+    {
+        // Validate operator
+        const Status status = GpuConv2d::validate_op(sketch, input_info, weight_info, nullptr, conv2d_attr);
+        ARM_COMPUTE_EXPECT(bool(status), framework::LogLevel::ERRORS);
+
+        dst_info = GpuConv2d::create_op(sketch, input_info, weight_info, nullptr, conv2d_attr);
+    }
+
+    // Create the weights tensor info of the second conv2d
+    ITensorInfo *weight_info_2 = context.create_tensor_info(t_weight_info);
+
+    // Try to fuse a second conv2d, fed by the first one's output, into the same workload
+    {
+        // Validate operator, should fail
+        const Status status           = GpuConv2d::validate_op(sketch, dst_info, weight_info_2, nullptr, conv2d_attr);
+        const auto expected_error_str = "Operator fusion test failed. This operator cannot be fused into the workload";
+
+        ARM_COMPUTE_EXPECT(!bool(status), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT((status.error_description().find(expected_error_str) != std::string::npos),
+                           framework::LogLevel::ERRORS);
+    }
+}
+TEST_SUITE_END() // Invalid_Fusion_Should_Fail
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
new file mode 100644
index 0000000000..9bfdc961fe
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/ArithmeticAddition.cpp from the standard interface.
+ *
+ * Difference          | Why the difference
+ * No quantized tests  | Not supported yet
+ * No in place tests   | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(ADD)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+               framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid data type combination
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // S16 is a valid data type for Add
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // S32 is a valid data type for Add
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching shapes
+                                                        TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),    // Unsupported data type QASYMM8
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),    // Unsupported data type QASYMM8_SIGNED
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32),    // Broadcast Y dimension is not allowed
+                                                        TensorInfo(TensorShape( 3U,  8U, 9U), 1, DataType::S16),    // Broadcast Z dimension is not allowed
+                                                        TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+                                                      }),
+               framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+                                                       TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),    // Unsupported data type QASYMM8
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),    // Unsupported data type QASYMM8_SIGNED
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for rhs
+                                                       TensorInfo(TensorShape(15U,  1U, 3U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape( 3U,  8U, 1U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("Expected", { true, false, true, true, false, true, false, false, true, false, false, true})),
+               input1_info, input2_info, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Validate Elementwise Add: each zipped (LhsInfo, RhsInfo) pair must yield the matching "Expected" flag
+    auto          lhs_info         = context.create_tensor_info(input1_info);
+    auto          rhs_info         = context.create_tensor_info(input2_info);
+
+    bool res = bool(GpuAdd::validate_op(sketch, lhs_info, rhs_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+constexpr AbsoluteTolerance<float> tolerance_f(
+    0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 and DataType::F16 */
+constexpr float tolerance_num = 0.0001f; /**< Tolerance number; passed as the additional validate() argument by the FP16 cases only */
+
+template <typename T> // One-op fixture bound to CLTensor/CLAccessor/GpuAdd; instantiated below as float, half, int32_t, int16_t, uint8_t
+using DynamicFusionCLAddFixture =
+    DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+template <typename T> // Broadcast one-op fixture; instantiated below as float and half
+using DynamicFusionCLAddBroadcastFixture =
+    DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+template <typename T> // Two-ops fixture; instantiated below as float only
+using DynamicFusionCLAddTwoOpsFixture =
+    DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLAddFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // F32 ADD over SmallShapes, InPlace=false: check _target (via CLAccessor) against _reference within tolerance_f
+    validate(CLAccessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+                       DynamicFusionCLAddFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::LargeShapes()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // F32 ADD over LargeShapes (NIGHTLY mode), InPlace=false: check _target against _reference within tolerance_f
+    validate(CLAccessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLAddBroadcastFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::TemporaryLimitedSmallShapesBroadcast()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // F32 ADD over TemporaryLimitedSmallShapesBroadcast, InPlace=false: check _target against _reference within tolerance_f
+    validate(CLAccessor(_target), _reference, tolerance_f);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+                       DynamicFusionCLAddBroadcastFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::TemporaryLimitedLargeShapesBroadcast()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // F32 ADD over TemporaryLimitedLargeShapesBroadcast (NIGHTLY mode): check _target against _reference within tolerance_f
+    validate(CLAccessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(
+    RunSmallTwoOps,
+    DynamicFusionCLAddTwoOpsFixture<float>,
+    framework::DatasetMode::PRECOMMIT,
+    combine(combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                    datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+                            framework::dataset::make("DataType", {DataType::F32})),
+                    framework::dataset::make("InPlace", {false})),
+            framework::dataset::make("FuseTwoOps", {true})))
+{
+    // F32 ADD with FuseTwoOps=true over the two-ops small shapes: check _target against _reference within tolerance_f
+    validate(CLAccessor(_target), _reference, tolerance_f);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLAddFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // F16 ADD over SmallShapes: check _target against _reference with tolerance_f, additionally passing tolerance_num
+    validate(CLAccessor(_target), _reference, tolerance_f, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLAddBroadcastFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::TemporaryLimitedSmallShapesBroadcast()),
+                                       framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // F16 ADD over TemporaryLimitedSmallShapesBroadcast: check _target against _reference with tolerance_f and tolerance_num
+    validate(CLAccessor(_target), _reference, tolerance_f, tolerance_num);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLAddFixture<int32_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::S32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // S32 ADD over SmallShapes: check _target against _reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLAddFixture<int16_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::S16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // S16 ADD over SmallShapes: check _target against _reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionCLAddFixture<int16_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::LargeShapes()),
+                                       framework::dataset::make("DataType", {DataType::S16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // S16 ADD over LargeShapes (NIGHTLY mode): check _target against _reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLAddFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::U8})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // U8 ADD over SmallShapes: check _target against _reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE_END() // ADD
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp b/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
new file mode 100644
index 0000000000..4ef359e74d
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ConvertPolicyDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+// Absolute tolerance constructed from 0; handed to validate() by both CAST_SUITE instantiations in this file
+constexpr AbsoluteTolerance<float> zero_tolerance(0);
+
+/** Input data sets **/
+
+// F16 -> F32: two "DataType" datasets combined, F16 first and F32 second (source then destination, per the dataset name)
+const auto CastF16toF32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
+
+// F32 -> F16: two "DataType" datasets combined, F32 first and F16 second (source then destination, per the dataset name)
+const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+
+class DFConvertPolicies final : public framework::dataset::ContainerDataset<std::vector<ConvertPolicy>>
+{
+public:
+    DFConvertPolicies() // Builds a container dataset named "ConvertPolicy"
+        : ContainerDataset("ConvertPolicy",
+    {
+        ConvertPolicy::WRAP // WRAP is the only policy in this dataset
+    })
+    {
+    }
+};
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(CAST)
+
+template <typename T> // Last fixture argument fixed to half; used below with T = float (F32_to_F16 suite)
+using DynamicFusionCLCastToF16Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, half>;
+template <typename T> // Last fixture argument fixed to float; used below with T = half (F16_to_F32 suite)
+using DynamicFusionCLCastToF32Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, float>;
+
+#define CAST_SUITE(NAME, idt, odt, type, dataset, tolerance)                                                                     \
+    TEST_SUITE(NAME)                                                                                                             \
+    FIXTURE_DATA_TEST_CASE(RunSmall, type, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), dataset), \
+                                                                                              DFConvertPolicies()))              \
+    {                                                                                                                            \
+        validate(CLAccessor(_target), _reference, tolerance);                                                                    \
+    }                                                                                                                            \
+    TEST_SUITE_END()
+
+// F16 -> F32 suite; note that the idt/odt arguments are not referenced by the CAST_SUITE body
+CAST_SUITE(F16_to_F32, DataType::F16, DataType::F32, DynamicFusionCLCastToF32Fixture<half>, CastF16toF32Dataset, zero_tolerance)
+
+// F32 -> F16 suite; note that the idt/odt arguments are not referenced by the CAST_SUITE body
+CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, DynamicFusionCLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
+
+TEST_SUITE_END() // CAST
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp b/tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp
new file mode 100644
index 0000000000..cef8b87c3f
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ClampFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr float                    epsilon = 1e-6f; // value wrapped by `tolerance` on the next line
+constexpr AbsoluteTolerance<float> tolerance(epsilon); // passed to every validate() call in this file
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(CLAMP)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Minimum value larger than maximum value
+                                                    }),
+                framework::dataset::make("MinVal", { 0.2f,
+                                                     1.5f,
+                                                     9.0f,
+                                                    })),
+                framework::dataset::make("MaxVal", { 0.5f,
+                                                     2.0f,
+                                                     1.0f,
+                                                    })),
+                framework::dataset::make("Expected", { true, true, false })),
+                input_info, min_val, max_val, expected)
+{
+    // Create a new workload sketch
+    CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Validate Clamp only: validate_op is called on the sketch, no operator is created here
+    const ITensorInfo* src_info = context.create_tensor_info(input_info);
+
+    ClampAttributes attributes {};
+    attributes.min_val(min_val)
+              .max_val(max_val);
+
+    const bool res = static_cast<bool>(GpuClamp::validate_op(sketch, src_info, attributes));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T> // Clamp fixture bound to CLTensor/CLAccessor/GpuClamp; instantiated below as half and float
+using DynamicFusionClampOpFixture = DynamicFusionClampValidationFixture<CLTensor, CLAccessor, GpuClamp, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionClampOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallShapes(),
+                                               framework::dataset::make(
+                                                   "ClampAttributes", {ClampAttributes().min_val(0.1f).max_val(0.6f)})),
+                                       framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // F16 clamp to [0.1, 0.6] over SmallShapes, Fuse=false: check _target against _reference within `tolerance`
+    validate(CLAccessor(_target), _reference, tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+                       DynamicFusionClampOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::Small5dShapes(),
+                                               framework::dataset::make(
+                                                   "ClampAttributes", {ClampAttributes().min_val(0.1f).max_val(0.6f)})),
+                                       framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // The output is NOT validated here: the body contains no validate() call, only the info message below
+    ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+    framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionClampOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallShapes(),
+                                               framework::dataset::make(
+                                                   "ClampAttributes", {ClampAttributes().min_val(0.2f).max_val(0.4f)})),
+                                       framework::dataset::make("Fuse", {true})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // F16 clamp to [0.2, 0.4] over SmallShapes, Fuse=true: check _target against _reference within `tolerance`
+    validate(CLAccessor(_target), _reference, tolerance);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionClampOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallShapes(),
+                                               framework::dataset::make(
+                                                   "ClampAttributes", {ClampAttributes().min_val(0.3f).max_val(0.7f)})),
+                                       framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // F32 clamp to [0.3, 0.7] over SmallShapes, Fuse=false: check _target against _reference within `tolerance`
+    validate(CLAccessor(_target), _reference, tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+                       DynamicFusionClampOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::Small5dShapes(),
+                                               framework::dataset::make(
+                                                   "ClampAttributes", {ClampAttributes().min_val(0.3f).max_val(0.7f)})),
+                                       framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // The output is NOT validated here: the body contains no validate() call, only the info message below
+    ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+    framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionClampOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallShapes(),
+                                               framework::dataset::make(
+                                                   "ClampAttributes", {ClampAttributes().min_val(0.1f).max_val(0.9f)})),
+                                       framework::dataset::make("Fuse", {true})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // F32 clamp to [0.1, 0.9] over SmallShapes, Fuse=true: check _target against _reference within `tolerance`
+    validate(CLAccessor(_target), _reference, tolerance);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // CLAMP
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
new file mode 100644
index 0000000000..2f8c639cea
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DepthwiseConvolutionLayerDataset.h"
+#include "tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+const auto depth_multipliers       = framework::dataset::make("DepthMultiplier", {1U, 4U}); // Combined into every RunSmall case below
+const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", {1, 2, 5, 8}); // Combined into every NIGHTLY RunLarge case below
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(DEPTHWISE_CONV2D)
+
+RelativeTolerance<float> tolerance_f32(
+    0.01f); /**< Relative tolerance for comparing the reference's output against the implementation's output for DataType::F32 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(
+    0.1)); /**< Relative tolerance for comparing the reference's output against the implementation's output for DataType::F16 */
+constexpr float                     tolerance_num = 0.02f; /**< Tolerance number: extra validate() argument, passed only by the FP16 Generic cases */
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(                                                                  // Explanations of failing tests; "Valid" rows are expected to pass
+                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // Mismatching data type input/weights
+                                                        TensorInfo(TensorShape(3U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // Mismatching input feature maps
+                                                        TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // Mismatching depth multiplier
+                                                        TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // Invalid biases size
+                                                        TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // Invalid biases dimensions
+                                                        TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // dilation < 1
+                                                        TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),                  // Valid: same tensors as the previous case, dilation (1, 1)
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC),              // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),       // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM16, DataLayout::NHWC),              // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM8, DataLayout::NHWC),               // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC),   // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM16, DataLayout::NHWC),             // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U8, DataLayout::NHWC),                   // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S8, DataLayout::NHWC),                   // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U16, DataLayout::NHWC),                  // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S16, DataLayout::NHWC),                  // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U32, DataLayout::NHWC),                  // Unsupported data type
+                                                        TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S32, DataLayout::NHWC),                  // Unsupported data type
+                                                        TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::F32, DataLayout::NCHW),                  // Unsupported data layout
+                                                        TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),              // Valid: 4D input, 3D weights
+                                                        TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),              // weight dimension > 3
+                                                        TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),              // Valid: Padding2D(2, 1, 2, 1)
+                                                        TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),              // Valid: Padding2D(2, 1, 2, 1) and stride (2, 3)
+                                                        TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),              // Valid: 24x4x3 weights, stride (2, 3), dilation (2, 3)
+                                                      }),
+                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F16, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM8, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM16, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM8, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM16, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U8, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S8, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U16, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S16, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::F32, DataLayout::NCHW),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U, 5U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                          TensorInfo(TensorShape(24U, 4U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                        })),
+                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(2U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NCHW),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+                                                       })),
+                framework::dataset::make("Padding", {  Padding2D(0, 0, 0, 0),
+                                                       Padding2D(0, 0, 0, 0),
+                                                       Padding2D(0, 0, 0, 0),
+                                                       Padding2D(0, 0, 0, 0),
+                                                       Padding2D(0, 0, 0, 0),
+                                                       Padding2D(0, 0, 0, 0),
+                                                       Padding2D(0, 0, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(1, 1, 0, 0),
+                                                       Padding2D(2, 1, 2, 1),
+                                                       Padding2D(2, 1, 2, 1),
+                                                       Padding2D(2, 1, 2, 1),
+                                                      })),
+                framework::dataset::make("Stride", {   Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(1, 1),
+                                                       Size2D(2, 3),
+                                                       Size2D(2, 3),
+                                                      })),
+                framework::dataset::make("DepthMultiplier", { 1,
+                                                              1,
+                                                              3,
+                                                              1,
+                                                              1,
+                                                              2,
+                                                              2,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                              3,
+                                                             })),
+                       framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(0U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(1U, 1U),
+                                                              Size2D(2U, 3U),
+                                                             })),
+                framework::dataset::make("Expected", { false, false, false, false, false, false, true, false, // cases 0-7 (0-based row index in each list above)
+                                                       false, false, false, false, false, false, false, false, false, false, // cases 8-17
+                                                       false, false, true, false, true, true, true })), // cases 18-24
+                input_info, weights_info, biases_info, padding, stride, depth_multiplier, dilation, expected)
+{
+    CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    const ITensorInfo* sketch_input_info   = context.create_tensor_info(input_info);
+    const ITensorInfo* sketch_weights_info = context.create_tensor_info(weights_info);
+    const ITensorInfo* sketch_biases_info  = context.create_tensor_info(biases_info);
+
+    DepthwiseConv2dAttributes attributes {};
+    attributes.pad(padding)
+              .stride(stride)
+              .dilation(dilation)
+              .depth_multiplier(depth_multiplier);
+
+    const Status status = GpuDepthwiseConv2d::validate_op(sketch, sketch_input_info, sketch_weights_info, sketch_biases_info, attributes);
+    const bool res = bool(status);
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS); // validate_op's verdict must match the "Expected" entry of this case
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionGpuDepthwiseConv2dFixture =
+    DynamicFusionGpuDepthwiseConv2dValidationFixture<CLTensor, CLAccessor, GpuDepthwiseConv2d, T>; // Shorthand for the CL fixture; instantiated with half and float below
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16); // F16/NHWC, small Dataset3x3: target vs. reference within tolerance_f16
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16); // F16/NHWC, large Dataset3x3 (NIGHTLY): target vs. reference within tolerance_f16
+}
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                               depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16); // F16/NHWC, small dilated Dataset3x3: target vs. reference within tolerance_f16
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16); // F16/NHWC, large dilated Dataset3x3 (NIGHTLY): target vs. reference within tolerance_f16
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); // F16/NHWC, small generic dataset: tolerance_f16 plus tolerance_num
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); // F16/NHWC, large generic dataset (NIGHTLY): tolerance_f16 plus tolerance_num
+}
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                               depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); // F16/NHWC, small dilated generic dataset: tolerance_f16 plus tolerance_num
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); // F16/NHWC, large dilated generic dataset (NIGHTLY): tolerance_f16 plus tolerance_num
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // Generic
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, small Dataset3x3: target vs. reference within tolerance_f32
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, large Dataset3x3 (NIGHTLY): target vs. reference within tolerance_f32
+}
+
+TEST_SUITE(Dilation)
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                               depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, small dilated Dataset3x3: target vs. reference within tolerance_f32
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, large dilated Dataset3x3 (NIGHTLY): target vs. reference within tolerance_f32
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, small generic dataset: target vs. reference within tolerance_f32
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, large generic dataset (NIGHTLY): target vs. reference within tolerance_f32
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeKernelSize,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(),
+                                               framework::dataset::make("DepthMultiplier", {1})),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, large-kernel-size dataset with depth multiplier 1 only: within tolerance_f32
+}
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                               depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, small dilated generic dataset: target vs. reference within tolerance_f32
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuDepthwiseConv2dFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                               large_depth_multipliers),
+                                       framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // F32/NHWC, large dilated generic dataset (NIGHTLY); the 3x3 variant is already covered by the W3x3 suite
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // Generic
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // DEPTHWISE_CONV2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
new file mode 100644
index 0000000000..b843764786
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+// Tolerances from tests/validation/CL/DirectConvolutionLayer.cpp
+/** Relative tolerance for comparing reference's output against implementation's output for DataType::F32 */
+const RelativeTolerance<float> tolerance_f32(0.05f);
+/** Relative tolerance for comparing reference's output against implementation's output for DataType::F16 */
+const RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2));
+/** Absolute tolerance for FP32 tests */
+constexpr float abs_tolerance_f32(0.0001f);
+constexpr float tolerance_num = 0.07f; /**< Tolerance number passed to validate() together with tolerance_f16 */
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+/** Synced with tests/validation/CL/ConvolutionLayer.cpp
+ *
+ * Difference                       | Why the difference
+ * f32 tolerance here is smaller    | To use the same tolerance as that of DirectConv2d; lowering tolerance is safe
+ * No quantized tests               | Not supported yet
+ * No grouped CNN tests             | Not supported yet
+ * No mixed layout tests            | Not needed; only NHWC is supported
+ * No activation                    | Not needed in fusion
+ * No ValidateConvolutionMethod     | Only a single method (direct conv2d) is supported
+ * No ReshapeWeights = true tests   | Not applicable yet. This parameter only concerns gemm-based conv2d
+ * No RunSmallWithPadding tests     | Padding is removed
+ *
+ */
+TEST_SUITE(CONV2D)
+
+template <typename T> // T: instantiated below with float (F32 tests) and half (F16 tests)
+using DynamicFusionGpuConv2dFixture = DynamicFusionGpuConv2dValidationFixture<CLTensor, CLAccessor, GpuConv2d, T>;
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, // Small convolution layer shapes, F32 / NHWC, default QuantizationInfo
+                       DynamicFusionGpuConv2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                               framework::dataset::make("DataType", DataType::F32)),
+                                       framework::dataset::make("DataLayout", {DataLayout::NHWC})),
+                               framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+{
+    // Validate the target output against the reference using the FP32 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, // Small convolution layer shapes, F16 / NHWC, default QuantizationInfo
+                       DynamicFusionGpuConv2dFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                               framework::dataset::make("DataType", DataType::F16)),
+                                       framework::dataset::make("DataLayout", {DataLayout::NHWC})),
+                               framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+{
+    // Validate the target output against the reference using the FP16 relative tolerance and tolerance_num
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // FP16
+
+// Tests for specific conv2d methods
+/** Synced with tests/validation/CL/DirectConvolutionLayer.cpp
+ *
+ * Difference                       | Why the difference
+ * No quantized tests               | Not supported yet
+ * No Invalid output size test      | Not applicable. Output is removed from the interface
+ * No mixed layout/NCHW tests       | Not needed; only NHWC is supported
+ * No activation tests              | Not needed in fusion
+ */
+TEST_SUITE(DIRECT_CONV2D)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // The i-th entry of every dataset below describes case i
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Invalid: Mismatching data type input/weights
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Invalid: Mismatching input feature maps
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Invalid weights dimensions
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Unsupported biases size
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Unsupported biases dimensions
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, DataLayout::NCHW),       // Unsupported data layout: NCHW
+                                                       TensorInfo(TensorShape(2U, 32U, 16U), 1, DataType::QASYMM8, DataLayout::NHWC),   // Unsupported data type: quantized
+                                                       TensorInfo(TensorShape(2U, 32U, 16U), 1, DataType::F32, DataLayout::NHWC),       // Valid: F32 NHWC input with 1x1 weights
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Arbitrary weight sizes for NHWC are supported
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Non-rectangular weights dimensions for NHWC are supported
+                                                       TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),       // Strides > 2 for any kernel sizes for NHWC are supported
+                                                     }),
+               framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F16, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(3U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 3U, 3U, 4U, 3U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32, DataLayout::NCHW),
+                                                        TensorInfo(TensorShape(2U, 1U, 1U, 4U), 1, DataType::QASYMM8, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 1U, 1U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 13U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 5U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                        TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+                                                     })),
+               framework::dataset::make("BiasesInfo",{ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(25U), 1, DataType::F32, DataLayout::NCHW),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::QASYMM8, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                       TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+                                                     })),
+               framework::dataset::make("Conv2dAttributes",  {
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+                                                        Conv2dAttributes().stride({3, 3}).pad({0, 0, 0, 0}),
+                                                      })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, true, true, true, true })),
+               input_info, weights_info, biases_info, conv2d_attrs, expected)
+{
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+    // Create the tensor infos through the workload context, then compare validate_op()'s verdict with the expectation
+    const ITensorInfo* sketch_input_info   = context.create_tensor_info(input_info);
+    const ITensorInfo* sketch_weights_info = context.create_tensor_info(weights_info);
+    const ITensorInfo* sketch_biases_info  = context.create_tensor_info(biases_info);
+    bool is_valid = bool(GpuConv2d::validate_op(sketch, sketch_input_info, sketch_weights_info, sketch_biases_info, conv2d_attrs));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+template <typename T> // T: instantiated below with half (FP16 suite) and float (FP32 suite)
+using DynamicFusionGpuDirectConv2dFixture = DynamicFusionDirectConv2dValidationFixture<CLTensor, CLAccessor, GpuConv2d, T>;
+
+TEST_SUITE(FP16)
+/// TODO: COMPMID-6877: Once the issue in Conv2d is resolved, re-enable these
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDirectConv2dFixture<half>, framework::DatasetMode::DISABLED,
+               combine(combine(combine(zip(zip(zip(zip(zip( // The i-th values of the zipped lists below form configuration i
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
+               framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
+               framework::dataset::make("PadX", { 1, 3, 0, 4 })),
+               framework::dataset::make("PadY", { 1, 3, 0, 4 })),
+               framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
+               framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
+               framework::dataset::make("DataType",  DataType::F16)),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); // FP16 relative tolerance plus tolerance_num
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDirectConv2dFixture<half>, framework::DatasetMode::NIGHTLY,
+               combine(combine(combine(zip(zip(zip(zip(zip( // Single configuration: one value per zipped list
+               framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 1 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 1 })),
+               framework::dataset::make("KernelSize", { 9 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::F16)),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); // FP16 relative tolerance plus tolerance_num
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+/// TODO: COMPMID-6877: Once the issue in Conv2d is resolved, re-enable these
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDirectConv2dFixture<float>, framework::DatasetMode::DISABLED,
+               combine(combine(combine(zip(zip(zip(zip(zip( // The i-th values of the zipped lists below form configuration i
+               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+                                                        TensorShape(19U, 5U, 16U, 4U),
+                                                        TensorShape(13U, 5U, 17U, 2U),
+                                                        TensorShape(32U, 37U, 13U) } ),
+               framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
+               framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
+               framework::dataset::make("PadX", { 1, 3, 0, 4 })),
+               framework::dataset::make("PadY", { 1, 3, 0, 4 })),
+               framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
+               framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
+               framework::dataset::make("DataType",  DataType::F32)),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); // FP32 relative tolerance, zero tolerance number, absolute tolerance
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDirectConv2dFixture<float>, framework::DatasetMode::NIGHTLY,
+               combine(combine(combine(zip(zip(zip(zip(zip( // Single configuration: one value per zipped list
+               framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
+               framework::dataset::make("StrideX", { 1 })),
+               framework::dataset::make("StrideY", { 1 })),
+               framework::dataset::make("PadX", { 1 })),
+               framework::dataset::make("PadY", { 1 })),
+               framework::dataset::make("KernelSize", { 9 })),
+               framework::dataset::make("NumKernels", { 3 })),
+               framework::dataset::make("DataType",  DataType::F32)),
+               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); // FP32 relative tolerance, zero tolerance number, absolute tolerance
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // DIRECT_CONV2D
+TEST_SUITE_END() // CONV2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
new file mode 100644
index 0000000000..82d66ca6ce
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/MatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/Validation.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Relative tolerance for comparing reference's output against implementation's output for FP32 data types */
+const RelativeTolerance<float> tolerance_f32(0.001f);
+/** Absolute tolerance for FP32 data types, for when the relative tolerance fails because of small values */
+constexpr float abs_tolerance_f32(0.0001f);
+/** Absolute tolerance for FP16 data types, for when the relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(0.001f);
+/** Relative tolerance for comparing reference's output against implementation's output for FP16 data types */
+const RelativeTolerance<half_float::half> tolerance_f16(half(0.02));
+} // namespace
+
+/** M0 block sizes to test (Lhs not transposed) - precommit */
+const auto m0_values_lhs_nt_precommit = framework::dataset::make("M0", {1, 2, 3});
+
+/** N0 block sizes to test (Rhs transposed) - precommit */
+const auto n0_values_rhs_t_precommit = framework::dataset::make("N0", {1, 2, 4});
+
+/** K0 block sizes to test (Rhs transposed) - precommit */
+const auto k0_values_rhs_t_precommit = framework::dataset::make("K0", {1, 2, 4});
+
+/** M0 block sizes to test (Lhs not transposed) - nightly */
+const auto m0_values_lhs_nt_nightly = framework::dataset::make("M0", {1, 2, 3, 4});
+
+/** N0 block sizes to test (Rhs transposed) - nightly */
+const auto n0_values_rhs_t_nightly = framework::dataset::make("N0", {1, 2, 3, 4, 8});
+
+/** K0 block sizes to test (Rhs transposed) - nightly */
+const auto k0_values_rhs_t_nightly = framework::dataset::make("K0", {1, 2, 3, 4, 8});
+
+class DFMatMulDataset final : public datasets::MatMulDataset // Shape configurations fed to the MatMul fixture tests below
+{
+public:
+    DFMatMulDataset()
+    {
+        // LHS = [K, M], RHS = [N, K], DST = [N, M]
+        add_config(TensorShape(1U, 1U), TensorShape(1U, 1U), TensorShape(1U, 1U)); // K = 1, M = 1, N = 1
+        add_config(TensorShape(1U, 2U), TensorShape(2U, 1U), TensorShape(2U, 2U)); // K = 1, M = 2, N = 2
+        add_config(TensorShape(9U, 6U), TensorShape(5U, 9U), TensorShape(5U, 6U)); // K = 9, M = 6, N = 5
+        add_config(TensorShape(32U, 37U), TensorShape(17U, 32U), TensorShape(17U, 37U)); // K = 32, M = 37, N = 17
+    }
+};
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+
+TEST_SUITE(MatMul)
+
+TEST_SUITE(Validate)
+TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
+{
+    // Each case couples a block-size configuration with whether validate_op() is expected to accept it
+    using BlockSizeCase = std::pair<MatMulKernelInfo, bool>;
+    const std::vector<BlockSizeCase> block_size_cases = {
+        // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+        // Lhs not-transposed, Rhs transposed
+        {MatMulKernelInfo(false, true, 0, 1, 1), false},  // M0 should be > 0
+        {MatMulKernelInfo(false, true, 3, 11, 1), false}, // N0 not in {1, 2, 3, 4, 8, 16}
+        {MatMulKernelInfo(false, true, 3, 7, 1), false},  // N0 not in {1, 2, 3, 4, 8, 16}
+        {MatMulKernelInfo(false, true, 3, 3, 12), false}, // K0 not in {1, 2, 3, 4, 8, 16}
+        {MatMulKernelInfo(false, true, 3, 3, 6), false},  // K0 not in {1, 2, 3, 4, 8, 16}
+        {MatMulKernelInfo(false, true, 5, 1, 2), true},   // Expected to be accepted
+        {MatMulKernelInfo(false, true, 3, 3, 3), true},   // Expected to be accepted
+        {MatMulKernelInfo(false, true, 2, 4, 8), true},   // Expected to be accepted
+    };
+
+    // Workload context and sketch shared by every validate_op() call below
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    // Use shapes big enough that block sizes are not truncated, and keep all dimensions equal so the result
+    // does not depend on the NT/T configuration: the block sizes are under test here, not the shapes.
+    const ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+    const ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+
+    for (const auto &block_size_case : block_size_cases)
+    {
+        const MatMulKernelInfo &kernel_info  = block_size_case.first;
+        const bool              is_supported = block_size_case.second;
+
+        MatMulAttributes matmul_attr{};
+        matmul_attr.adj_lhs(kernel_info.adj_lhs);
+        matmul_attr.adj_rhs(kernel_info.adj_rhs);
+        GpuMatMulSettings matmul_settings{};
+        matmul_settings.m0(kernel_info.m0);
+        matmul_settings.n0(kernel_info.n0);
+        matmul_settings.k0(kernel_info.k0);
+
+        Status status = GpuMatMul::validate_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+        ARM_COMPUTE_EXPECT(bool(status) == is_supported, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+    // Workload context and sketch shared by every validate_op() call below
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    // {lhs shape, rhs shape, expected result}; shapes are written as Nt/Nt and permuted below per transpose flag
+    using ShapeCase                              = std::tuple<TensorShape, TensorShape, bool>;
+    const std::vector<ShapeCase> shape_cases = {
+        {TensorShape(5U, 1U), TensorShape(3U, 5U), true},
+        {TensorShape(10U, 12U), TensorShape(3U, 10U), true},
+        {TensorShape(8U, 4U), TensorShape(2U, 8U), true},
+        {TensorShape(8U, 4U), TensorShape(2U, 5U), false}, // Mismatch in the K dimension
+        {TensorShape(5U, 0U), TensorShape(2U, 5U), false}, // Invalid dimension
+        {TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), true},
+        {TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false}, // no batch broadcasting
+        {TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U),
+         false}, // mismatch in batch dimension
+    };
+
+    for (const auto &shape_case : shape_cases)
+    {
+        const bool expected = std::get<2>(shape_case);
+
+        // Only the Lhs not-transposed / Rhs transposed combination is iterated over here
+        for (bool adj_lhs : {false})
+        {
+            for (bool adj_rhs : {true})
+            {
+                // Start from the Nt/Nt shapes and swap the first two dimensions of each transposed operand
+                TensorShape lhs_shape = std::get<0>(shape_case);
+                TensorShape rhs_shape = std::get<1>(shape_case);
+                if (adj_lhs)
+                {
+                    permute(lhs_shape, PermutationVector(1U, 0U));
+                }
+                if (adj_rhs)
+                {
+                    permute(rhs_shape, PermutationVector(1U, 0U));
+                }
+
+                const ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(lhs_shape, 1, DataType::F32));
+                const ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(rhs_shape, 1, DataType::F32));
+
+                MatMulAttributes matmul_attr{};
+                matmul_attr.adj_lhs(adj_lhs);
+                matmul_attr.adj_rhs(adj_rhs);
+
+                GpuMatMulSettings matmul_settings{};
+                matmul_settings.m0(1);
+                matmul_settings.n0(1);
+                matmul_settings.k0(1);
+
+                Status status = GpuMatMul::validate_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+                ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+            }
+        }
+    }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    // Tuple layout: {lhs type, rhs type, dst type, expected validate_op() result}; the dst type is not read below
+    using DataTypeCase = std::tuple<DataType, DataType, DataType, bool>;
+    const std::vector<DataTypeCase> data_type_cases = {
+        {DataType::F32, DataType::F32, DataType::F32, true},
+        {DataType::F16, DataType::F16, DataType::F16, true},
+        {DataType::F16, DataType::F32, DataType::F32, false},                                  // no mixed precision
+        {DataType::F64, DataType::F64, DataType::F64, false},                                  // no double precision
+        {DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false},                      // no quantized types
+        {DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false}, // no quantized types
+        {DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL,
+         false},                                                             // no quantized types
+        {DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false}, // no quantized types
+        {DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false},    // no quantized types
+        {DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false},       // no quantized types
+        {DataType::S64, DataType::S64, DataType::S64, false},                // no integral types
+        {DataType::S32, DataType::S32, DataType::S32, false},                // no integral types
+        {DataType::S16, DataType::S16, DataType::S16, false},                // no integral types
+        {DataType::S8, DataType::S8, DataType::S8, false},                   // no integral types
+        {DataType::U64, DataType::U64, DataType::U64, false},                // no integral types
+        {DataType::U32, DataType::U32, DataType::U32, false},                // no integral types
+        {DataType::U16, DataType::U16, DataType::U16, false},                // no integral types
+        {DataType::U8, DataType::U8, DataType::U8, false},                   // no integral types
+    };
+    // Workload context and sketch shared by every validate_op() call below
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    // Attributes, block sizes and the 10x10 operand shape stay fixed; only the data types vary between cases
+    MatMulAttributes matmul_attr{};
+    matmul_attr.adj_lhs(false);
+    matmul_attr.adj_rhs(false); // NOTE(review): the other MatMul tests here use adj_rhs = true - confirm this is intended
+    GpuMatMulSettings matmul_settings{};
+    matmul_settings.m0(1);
+    matmul_settings.n0(1);
+    matmul_settings.k0(1);
+    const TensorShape operand_shape(10U, 10U);
+    for (const auto &data_type_case : data_type_cases)
+    {
+        const DataType lhs_type = std::get<0>(data_type_case);
+        const DataType rhs_type = std::get<1>(data_type_case);
+        const bool     expected = std::get<3>(data_type_case);
+        const ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(operand_shape, 1, lhs_type));
+        const ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(operand_shape, 1, rhs_type));
+        Status status = GpuMatMul::validate_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_SUITE_END() // Validate
+
+template <typename T> // T: instantiated below with float (FP32 suite) and half (FP16 suite)
+using DynamicFusionGpuMatmulFixture = DynamicFusionGpuMatMulValidationFixture<CLTensor, CLAccessor, GpuMatMul, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit, // F32, Lhs not transposed / Rhs transposed, precommit M0/N0/K0 values
+                       DynamicFusionGpuMatmulFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_precommit,
+                               n0_values_rhs_t_precommit,
+                               k0_values_rhs_t_precommit,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output with the FP32 relative tolerance, a zero tolerance number and the FP32 absolute tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly, // F32, Lhs not transposed / Rhs transposed, nightly M0/N0/K0 values
+                       DynamicFusionGpuMatmulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_nightly,
+                               n0_values_rhs_t_nightly,
+                               k0_values_rhs_t_nightly,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output with the FP32 relative tolerance, a zero tolerance number and the FP32 absolute tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit, // F16, Lhs not transposed / Rhs transposed, precommit M0/N0/K0 values
+                       DynamicFusionGpuMatmulFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_precommit,
+                               n0_values_rhs_t_precommit,
+                               k0_values_rhs_t_precommit,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output with the FP16 relative tolerance, a zero tolerance number and the FP16 absolute tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly, // F16, Lhs not transposed / Rhs transposed, nightly M0/N0/K0 values
+                       DynamicFusionGpuMatmulFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_nightly,
+                               n0_values_rhs_t_nightly,
+                               k0_values_rhs_t_nightly,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output with the FP16 relative tolerance, a zero tolerance number and the FP16 absolute tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
new file mode 100644
index 0000000000..af02ce3eaa
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/PixelwiseMultiplication.cpp from the standard interface.
+ *
+ * Difference              | Why the difference
+ * No integer tests        | Not supported yet
+ * No quantized tests      | Not supported yet
+ * No convert policy tests | Not needed as convert policy is ignored by floating types
+ * No scale tests          | Not supported yet
+ * No rounding modes tests | Not supported yet
+ * No in place tests       | Not supported yet
+ * No activation tests     | Not needed in dynamic fusion interface
+ *
+ */
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f16(
+    0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> tolerance_f32(
+    0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+} // namespace
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(MUL)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+               framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid data type combination
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),     // Unsupported data type U8
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),     // Unsupported data type S8
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // Unsupported data type S16
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // Unsupported data type S32
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),    // Unsupported data type QASYMM8
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),    // Unsupported data type QASYMM8_SIGNED
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching shapes
+                                                        TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32),    // Broadcast Y dimension is not allowed
+                                                        TensorInfo(TensorShape( 3U,  8U, 9U), 1, DataType::F32),    // Broadcast Z dimension is not allowed
+                                                        TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+                                                      }),
+               framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                                       TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for rhs
+                                                       TensorInfo(TensorShape(15U,  1U, 3U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape( 3U,  8U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false, false, false, true, true, false, false, true })),
+               input1_info, input2_info, expected)
+{
+    // Build a workload context from the CL compile context and create a fresh sketch on it
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Register both operand infos with the context before asking GpuMul to validate them
+    auto          lhs_info         = context.create_tensor_info(input1_info);
+    auto          rhs_info         = context.create_tensor_info(input2_info);
+
+    bool res = bool(GpuMul::validate_op(sketch, lhs_info, rhs_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionCLMulFixture = DynamicFusionMulOneOpValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+template <typename T>
+using DynamicFusionCLMulBroadcastFixture = DynamicFusionMulBroadcastValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+template <typename T>
+using DynamicFusionCLMulTwoOpsFixture = DynamicFusionMulTwoOpsValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLMulFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLMulBroadcastFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::TemporaryLimitedSmallShapesBroadcast(),
+                                       framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+                       DynamicFusionCLMulBroadcastFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(datasets::TemporaryLimitedLargeShapesBroadcast(),
+                                       framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // F16
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLMulFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+                       DynamicFusionCLMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLMulBroadcastFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::TemporaryLimitedSmallShapesBroadcast(),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+                       DynamicFusionCLMulBroadcastFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(datasets::TemporaryLimitedLargeShapesBroadcast(),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionCLMulTwoOpsFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes(),
+                                               framework::dataset::make("DataType", {DataType::F32})),
+                                       framework::dataset::make("InPlace", {false})),
+                               framework::dataset::make("FuseTwoOps", {true})))
+{
+    // Two-op run with FuseTwoOps fixed to true: compare the CL target against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // F32
+
+TEST_SUITE_END() // MUL
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
new file mode 100644
index 0000000000..be816b32b3
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/dynamic_fusion/PoolingLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(POOL2D)
+
+constexpr AbsoluteTolerance<float> tolerance_f32(
+    0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float> tolerance_f16(
+    0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+
+const auto PoolingLayerDatasetFP =
+    combine(combine(combine(combine(framework::dataset::make("PoolingType", {PoolingType::MAX, PoolingType::AVG}),
+                                    framework::dataset::make("PoolingSize", {Size2D(2, 2), Size2D(3, 3)})),
+                            framework::dataset::make("Pad", {Padding2D()})),
+                    framework::dataset::make("Stride", {Size2D(1, 1), Size2D(2, 1), Size2D(5, 7)})),
+            framework::dataset::make("ExcludePadding", {true}));
+
+template <typename T>
+using DynamicFusionGpuPool2dFixture = DynamicFusionGpuPool2dValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+
+template <typename T>
+using DFSpecialGpuPool2dFixture = DynamicFusionGpuPool2dSpecialValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+// *INDENT-OFF*
+// clang-format off
+
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+            framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid parameters, unsupported pooling
+                                                    TensorInfo(TensorShape(5U, 15U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Valid Non-rectangular Global Pooling
+                                                    TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid - Quantized not supported.
+                                                    TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Valid global pooling
+                                                    TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32, DataLayout::NCHW),     // Unsupported data layout
+                                                }),
+            framework::dataset::make("Pool2dAttributes", {
+                                                    Pool2dAttributes().pool_type(PoolingType::L2).pool_size(Size2D(3,3)).pad(Padding2D(0,0,0,0)).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(15U, 13U)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D()).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)),
+                                                })),
+            framework::dataset::make("Expected", { false, true, false, true, false })),
+            input_info, pool2d_attr, expected)
+{
+    // Build a workload context from the CL compile context and create a fresh sketch on it
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Default-constructed settings; binding the temporary to a const reference extends its lifetime to this scope
+    const GpuPool2dSettings &settings = GpuPool2dSettings();
+
+    // Register the input info with the context and compare validate_op's verdict with the expected flag
+    auto                   src_info    = context.create_tensor_info(input_info);
+    bool                   res         = bool(GpuPool2d::validate_op(sketch, src_info, pool2d_attr, settings));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuPool2dFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallNoneUnitShapes(), PoolingLayerDatasetFP),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionGpuPool2dFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(datasets::LargeShapes(), PoolingLayerDatasetFP),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSpecial,
+                       DFSpecialGpuPool2dFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::PoolingLayerDatasetSpecialDynamicFusion(),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the CL target tensor against the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+FIXTURE_DATA_TEST_CASE(
+    RunSmall,
+    DynamicFusionGpuPool2dFixture<float>,
+    framework::DatasetMode::ALL,
+    combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+                                                                             {TensorShape(27U, 13U, 2U),
+                                                                              TensorShape(27U, 13U, 2U, 4U)}),
+                                                    framework::dataset::make("PoolingType",
+                                                                             {PoolingType::AVG, PoolingType::MAX})),
+                                            framework::dataset::make("PoolingSize", {Size2D(27, 13)})),
+                                    framework::dataset::make("Pad", {Padding2D()})),
+                            framework::dataset::make("Stride", {Size2D(1, 1)})),
+                    framework::dataset::make("ExcludePadding", true)),
+            framework::dataset::make("DataType", DataType::F32)))
+{
+    // Pool size (27, 13) equals the first two input dimensions; compare the CL target with the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(
+    RunLarge,
+    DynamicFusionGpuPool2dFixture<float>,
+    framework::DatasetMode::NIGHTLY,
+    combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+                                                                             {TensorShape(79U, 37U, 11U),
+                                                                              TensorShape(79U, 37U, 11U, 4U)}),
+                                                    framework::dataset::make("PoolingType",
+                                                                             {PoolingType::AVG, PoolingType::MAX})),
+                                            framework::dataset::make("PoolingSize", {Size2D(79, 37)})),
+                                    framework::dataset::make("Pad", {Padding2D()})),
+                            framework::dataset::make("Stride", {Size2D(1, 1)})),
+                    framework::dataset::make("ExcludePadding", true)),
+            framework::dataset::make("DataType", DataType::F32)))
+{
+    // Pool size (79, 37) equals the first two input dimensions; compare the CL target with the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+TEST_SUITE(GlobalPooling)
+FIXTURE_DATA_TEST_CASE(
+    RunSmall,
+    DynamicFusionGpuPool2dFixture<half>,
+    framework::DatasetMode::ALL,
+    combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+                                                                             {TensorShape(27U, 13U, 2U),
+                                                                              TensorShape(27U, 13U, 2U, 4U)}),
+                                                    framework::dataset::make("PoolingType",
+                                                                             {PoolingType::AVG, PoolingType::MAX})),
+                                            framework::dataset::make("PoolingSize", {Size2D(27, 13)})),
+                                    framework::dataset::make("Pad", {Padding2D()})),
+                            framework::dataset::make("Stride", {Size2D(1, 1)})),
+                    framework::dataset::make("ExcludePadding", true)),
+            framework::dataset::make("DataType", DataType::F16)))
+{
+    // Pool size (27, 13) equals the first two input dimensions; compare the CL target with the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(
+    RunLarge,
+    DynamicFusionGpuPool2dFixture<half>,
+    framework::DatasetMode::NIGHTLY,
+    combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+                                                                             {TensorShape(79U, 37U, 11U),
+                                                                              TensorShape(79U, 37U, 11U, 4U)}),
+                                                    framework::dataset::make("PoolingType",
+                                                                             {PoolingType::AVG, PoolingType::MAX})),
+                                            framework::dataset::make("PoolingSize", {Size2D(79, 37)})),
+                                    framework::dataset::make("Pad", {Padding2D()})),
+                            framework::dataset::make("Stride", {Size2D(1, 1)})),
+                    framework::dataset::make("ExcludePadding", true)),
+            framework::dataset::make("DataType", DataType::F16)))
+{
+    // Pool size (79, 37) equals the first two input dimensions; compare the CL target with the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // POOL2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
new file mode 100644
index 0000000000..d46754ccca
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ReshapeLayerDataset.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ReshapeFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(RESHAPE)
+
+DATA_TEST_CASE(Validate,
+               framework::DatasetMode::DISABLED,
+               zip(zip(framework::dataset::make(
+                           "InputInfo",
+                           {
+                               TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),
+                               TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
+                               TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32) /* element counts differ: 768 in, 36864 out */,
+                           }),
+                       framework::dataset::make("OutputShape",
+                                                {
+                                                    TensorShape(9U, 5U, 21U),
+                                                    TensorShape(8U, 24U, 4U),
+                                                    TensorShape(192U, 192U),
+                                                })),
+                   framework::dataset::make("Expected", {true, true, false})),
+               input_info,
+               output_shape,
+               expected)
+{
+    // Build a workload context from the CL compile context and create a fresh sketch on it
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch sketch{&context};
+
+    // Create sketch tensors (input_shape is only read and then marked unused; NOTE(review): candidate for removal)
+    TensorShape input_shape = input_info.tensor_shape();
+    ARM_COMPUTE_UNUSED(input_shape);
+    ITensorInfo *src_info = context.create_tensor_info(input_info);
+
+    ReshapeAttributes attributes;
+    attributes.shape(output_shape);
+    Status status = GpuReshape::validate_op(sketch, src_info, attributes);
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+
+template <typename T>
+using DynamicFusionGpuReshapeLayerFixture =
+    DynamicFusionGpuReshapeLayerValidationFixture<CLTensor, CLAccessor, GpuReshape, T>;
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuReshapeLayerFixture<float>,
+                       framework::DatasetMode::DISABLED,
+                       combine(datasets::SmallReshapeLayerDataset(),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the CL target against the reference with no tolerance argument (presumably an exact match - confirm in Validation.h)
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // F32
+
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuReshapeLayerFixture<half>,
+                       framework::DatasetMode::DISABLED,
+                       combine(datasets::SmallReshapeLayerDataset(),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Compare the CL target against the reference with no tolerance argument (presumably an exact match - confirm in Validation.h)
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // F16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuReshapeLayerFixture<uint8_t>,
+                       framework::DatasetMode::DISABLED,
+                       combine(datasets::SmallReshapeLayerDataset(),
+                               framework::dataset::make("DataType", DataType::U8)))
+{
+    // Compare the CL target against the reference with no tolerance argument (presumably an exact match - confirm in Validation.h)
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuReshapeLayerFixture<int8_t>,
+                       framework::DatasetMode::DISABLED,
+                       combine(datasets::SmallReshapeLayerDataset(),
+                               framework::dataset::make("DataType", DataType::S8)))
+{
+    // Compare the CL target against the reference with no tolerance argument (presumably an exact match - confirm in Validation.h)
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S8
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionGpuReshapeLayerFixture<int16_t>,
+                       framework::DatasetMode::DISABLED,
+                       combine(datasets::SmallReshapeLayerDataset(),
+                               framework::dataset::make("DataType", DataType::S16)))
+{
+    // Compare the CL target against the reference with no tolerance argument (presumably an exact match - confirm in Validation.h)
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE_END() // RESHAPE
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
new file mode 100644
index 0000000000..a6bcf4ae26
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ScaleValidationDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ResizeFixture.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+using datasets::ScaleAlignCornersSamplingPolicySet;
+using datasets::ScaleInterpolationPolicySet;
+using datasets::ScaleSamplingPolicySet;
+using datasets::ScaleShapesBaseDataSet;
+
+/** Vector width in bytes assumed by these tests: the widest vector used by
+ * @ref CLScaleKernel is currently 16 bytes (float4).
+ */
+constexpr uint32_t vector_byte = 16;
+/** Number of elements of type T that fit in a vector of @ref vector_byte bytes. */
+template <typename T>
+constexpr uint32_t num_elements_per_vector()
+{
+    return vector_byte / sizeof(T);
+}
+
+/** Quantization information data set. NOTE(review): appears unused in this file - confirm. */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
+                                                          {
+                                                              QuantizationInfo(0.5f, -1),
+                                                          });
+
+/** Absolute tolerance passed as the last argument of validate() in the FP32 cases */
+constexpr float tolerance_f32_absolute(0.001f);
+/** Relative tolerances for FP32 / FP16, plus the absolute tolerance used by the FP16 cases */
+RelativeTolerance<float> tolerance_f32(0.05);
+constexpr float          abs_tolerance_f16(0.1f);
+RelativeTolerance<half>  tolerance_f16(half(0.1));
+/** Passed to validate() right after tolerance_f32 (presumably the allowed mismatch ratio - confirm) */
+constexpr float tolerance_num_f32(0.01f);
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(RESIZE)
+
+TEST_SUITE(Validate)
+
+const auto default_input_shape  = TensorShape{2, 3, 3, 2};
+const auto default_output_shape = TensorShape{4, 6, 3, 2};
+// NOTE(review): U8 is marked unsupported in SupportDataType below, so U8-based cases may fail on data type alone.
+constexpr auto default_data_type   = DataType::U8;
+constexpr auto default_data_layout = DataLayout::NHWC;
+
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
+{
+    const TensorInfo input_info  = TensorInfo{default_input_shape, 1, default_data_type, default_data_layout};
+    const TensorInfo output_info = TensorInfo{default_output_shape, 1, default_data_type, default_data_layout};
+    // NOTE(review): input_info and output_info are not passed to validate_op in this case.
+    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch  sketch{&context};
+
+    // A null source tensor info is passed, so validate_op is expected to report an error status
+    Status status = GpuResize::validate_op(sketch, nullptr, ResizeAttributes());
+    ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
+{
+    const std::map<DataType, bool> supported_data_types =
+    {
+        { DataType::U8, false },
+        { DataType::S8, false },
+        { DataType::QSYMM8, false },
+        { DataType::QASYMM8, false },
+        { DataType::QASYMM8_SIGNED, false },
+        { DataType::QSYMM8_PER_CHANNEL, false },
+        { DataType::U16, false },
+        { DataType::S16, false },
+        { DataType::QSYMM16, false },
+        { DataType::QASYMM16, false },
+        { DataType::U32, false },
+        { DataType::S32, false },
+        { DataType::U64, false },
+        { DataType::S64, false },
+        { DataType::BFLOAT16, false },
+        { DataType::F16, true },
+        { DataType::F32, true },
+        { DataType::F64, false },
+        { DataType::SIZET, false },
+    };
+    // Run validate_op once per entry and compare the outcome with the expected flag of that entry.
+    for (auto &kv : supported_data_types)
+    {
+        const TensorInfo input_info = TensorInfo{default_input_shape, 1, kv.first, default_data_layout};
+
+        CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch  sketch{&context};
+
+        const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+        ResizeAttributes attributes;
+        attributes.output_width(default_output_shape[0]); // shape is not important unless it's empty
+        attributes.output_height(default_output_shape[1]);
+
+        Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+        ARM_COMPUTE_EXPECT(bool(status) == kv.second, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_CASE(MismatchingDataType, framework::DatasetMode::ALL)
+{
+    constexpr DataType non_default_data_type = DataType::F32;
+    // NOTE(review): output_info is never passed to validate_op; the U8 input alone may cause the failure - confirm.
+    const TensorInfo input_info  = TensorInfo{default_input_shape, 1, default_data_type, default_data_layout};
+    const TensorInfo output_info = TensorInfo{default_output_shape, 1, non_default_data_type, default_data_layout};
+
+    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch  sketch{&context};
+
+    const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+    Status status = GpuResize::validate_op(sketch, sketch_input_info, ResizeAttributes());
+    ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
+{
+    // Aligned corners require sampling policy to be TOP_LEFT; CENTER is used here to provoke a rejection.
+    constexpr InterpolationPolicy interpolation_policy = InterpolationPolicy::BILINEAR;
+    constexpr bool                align_corners        = true;
+    constexpr SamplingPolicy      sampling_policy      = SamplingPolicy::CENTER;
+    // NOTE(review): the input is U8 (unsupported per SupportDataType), which may mask the align_corners check.
+    const TensorInfo input_info  = TensorInfo{default_input_shape, 1, default_data_type, default_data_layout};
+    const TensorInfo output_info = TensorInfo{default_output_shape, 1, default_data_type, default_data_layout};
+
+    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch  sketch{&context};
+
+    const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+    ResizeAttributes attributes{};
+    attributes.interpolation_policy(interpolation_policy).sampling_policy(sampling_policy).align_corners(align_corners);
+
+    Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+    ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(UnsupportedInterpolationPolicy, framework::DatasetMode::ALL)
+{
+    const TensorInfo input_info  = TensorInfo{TensorShape(28U, 33U, 2U), 1, DataType::F32, default_data_layout};
+    const TensorInfo output_info = TensorInfo{TensorShape(26U, 21U, 2U), 1, DataType::F32, default_data_layout};
+    constexpr auto   interpolation_policy = InterpolationPolicy::AREA;
+    // AREA is the policy expected to be rejected; output_info is not passed to validate_op.
+    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch  sketch{&context};
+
+    const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+    ResizeAttributes attributes{};
+    attributes.interpolation_policy(interpolation_policy);
+
+    Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+    ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(UnsupportedLayout, framework::DatasetMode::ALL)
+{
+    const TensorInfo input_info           = TensorInfo{default_input_shape, 1, default_data_type, DataLayout::NCHW};
+    const TensorInfo output_info          = TensorInfo{default_output_shape, 1, default_data_type, DataLayout::NCHW};
+    constexpr auto   interpolation_policy = InterpolationPolicy::BILINEAR;
+    // NOTE(review): the input is U8 (unsupported per SupportDataType), which may mask the NCHW layout check.
+    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+    GpuWorkloadSketch  sketch{&context};
+
+    const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+    ResizeAttributes attributes{};
+    attributes.interpolation_policy(interpolation_policy);
+
+    Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+    ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END() // Validate
+
+template <typename T>
+using DynamicFusionResizeFixture = DynamicFusionResizeValidationFixture<CLTensor, CLAccessor, GpuResize, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+// Precommit shape dataset, parameterised by the float element count per vector, combined with F32
+const auto f32_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<float>())),
+                               framework::dataset::make("DataType", DataType::F32));
+
+FIXTURE_DATA_TEST_CASE(Run,
+                       DynamicFusionResizeFixture<float>,
+                       framework::DatasetMode::ALL,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_shape, ScaleSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP32 tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+
+FIXTURE_DATA_TEST_CASE(RunAlignCorners,
+                       DynamicFusionResizeFixture<float>,
+                       framework::DatasetMode::ALL,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP32 tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+const auto f32_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<float>())),
+                                       framework::dataset::make("DataType", DataType::F32));
+FIXTURE_DATA_TEST_CASE(RunNightly,
+                       DynamicFusionResizeFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_nightly_shape, ScaleSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP32 tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
+                       DynamicFusionResizeFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_nightly_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP32 tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+const auto f16_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<half>())),
+                               framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(Run,
+                       DynamicFusionResizeFixture<half>,
+                       framework::DatasetMode::ALL,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_shape, ScaleSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP16 relative and absolute tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunAlignCorners,
+                       DynamicFusionResizeFixture<half>,
+                       framework::DatasetMode::ALL,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP16 relative and absolute tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+const auto f16_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<half>())),
+                                       framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(RunNightly,
+                       DynamicFusionResizeFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_nightly_shape, ScaleSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP16 relative and absolute tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
+                       DynamicFusionResizeFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_nightly_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Valid region of the scaled output, computed from the source info and the reference shape
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region =
+        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+    // Compare target and reference within the valid region using the FP16 relative and absolute tolerances
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // RESIZE
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp
new file mode 100644
index 0000000000..0134a7c11b
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f32(1e-6f); // Absolute tolerance used by the FP32 cases
+constexpr AbsoluteTolerance<float> tolerance_f16(0.001f); // Absolute tolerance used by the FP16 cases
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SIGMOID)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type
+                                                    }),
+                framework::dataset::make("Expected", { true, true, false })),
+                input_info, expected)
+{
+    // Create a new workload sketch
+    CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Create the source tensor info in the workload context; the operator is only validated, not fused
+    const ITensorInfo *src_info = context.create_tensor_info(input_info);
+
+    const bool res = static_cast<bool>(GpuSigmoid::validate_op(sketch, src_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionSigmoidOpFixture = DynamicFusionSigmoidValidationFixture<CLTensor, CLAccessor, GpuSigmoid, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionSigmoidOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Compare target and reference using the FP16 absolute tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+                       DynamicFusionSigmoidOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // No comparison is performed here: the body only reports that 5D+ tensors are unsupported
+    ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+    framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionSigmoidOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Compare target and reference using the FP16 absolute tolerance ("Fuse" is set to true here)
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionSigmoidOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare target and reference using the FP32 absolute tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+                       DynamicFusionSigmoidOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // No comparison is performed here: the body only reports that 5D+ tensors are unsupported
+    ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+    framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionSigmoidOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare target and reference using the FP32 absolute tolerance ("Fuse" is set to true here)
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // SIGMOID
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
new file mode 100644
index 0000000000..8f5a1ed14a
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/SoftmaxFixture.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Tolerance for float operations (const: immutable, and internal linkage at namespace scope) */
+const RelativeTolerance<half>  tolerance_f16(half(0.2));
+const RelativeTolerance<float> tolerance_f32(0.001f);
+
+using framework::dataset::make;
+
+/// TODO: COMPMID-6713
+/// Softmax is not implemented in CKW. Therefore, the tests are DISABLED.
+/// Enable the tests when Softmax is implemented in CKW.
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SOFTMAX)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::DISABLED,
+    zip(
+        make("InputInfo", {
+            TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+            TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::S32), // Unsupported data type
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F16),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+        }),
+        make("OutputInfo",{
+            TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+            TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM16), // Unsupported data type
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+        }),
+        make("beta", {
+            1.0,
+            2.0,
+            2.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+        }),
+        make("axis", {
+            0,
+            0,
+            1,  // Invalid as axis != 0
+            0,
+            0,
+            0,
+            -3, // Invalid as axis != 0
+            2,  // Invalid as axis != 0
+            1,  // Invalid as axis != 0
+            -1, // Invalid as axis != 0
+        }),
+        make("Expected", { false, false, false, true, false, false, false, false, false, false})),
+        input_info, output_info, beta, axis, expected)
+{
+    // Create a new workload sketch
+    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch  sketch{ &context };
+    // Build the attributes and tensor infos for this row, then compare validate_op with the expected flag
+    SoftmaxAttributes softmax_attr{};
+    softmax_attr.axis(axis).beta(beta).is_log_softmax(false);
+    ITensorInfo* src_info  = context.create_tensor_info(input_info);
+    ITensorInfo* dst_info = context.create_tensor_info(output_info);
+    const bool res = static_cast<bool>(GpuSoftmax::validate_op(sketch, src_info, dst_info, softmax_attr));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+template <typename T>
+using DynamicFusionSoftmaxLayerFixture = DynamicFusionSoftmaxValidationFixture<CLTensor, CLAccessor, GpuSoftmax, T>;
+
+TEST_SUITE(FLOAT)
+TEST_SUITE(FP32)
+// Every case in this suite is registered with DatasetMode::DISABLED
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
+{
+    // Compare target and reference using the FP32 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
+{
+    // Compare target and reference using the FP32 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayer4DShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
+{
+    // Compare target and reference using the FP32 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE(FP16)
+// Every case in this suite is registered with DatasetMode::DISABLED
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
+{
+    // Compare target and reference using the FP16 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
+{
+    // Compare target and reference using the FP16 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayer4DShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
+{
+    // Compare target and reference using the FP16 relative tolerance
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // FLOAT
+
+TEST_SUITE_END() // SOFTMAX
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
new file mode 100644
index 0000000000..c7ab1e717c
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/ArithmeticSubtraction.cpp from the standard interface.
+ *
+ * Difference          | Why the difference
+ * No quantized tests  | Not supported yet
+ * No in place tests   | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SUB)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+               framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U32),    // Unsupported data type U32
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),    // Unsupported data type QASYMM8
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),    // Unsupported data type QASYMM8_SIGNED
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid data type combination
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // S16 is a valid data type for Sub (see the S16 suite below)
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // S32 is a valid data type for Sub (see the S32 suite below)
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching shapes
+                                                        TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32),    // Broadcast Y dimension is not allowed
+                                                        TensorInfo(TensorShape( 3U,  8U, 9U), 1, DataType::S16),    // Broadcast Z dimension is not allowed
+                                                        TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+                                                      }),
+               framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+                                                       TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for rhs
+                                                       TensorInfo(TensorShape(15U,  1U, 3U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape( 3U,  8U, 1U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("Expected", { true, false, false, false, false, true, true, false, true, true, false, false, true })),
+               input1_info, input2_info, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Register both operand infos with the context, then check GpuSub::validate_op against the expected verdict
+    auto          lhs_info         = context.create_tensor_info(input1_info);
+    auto          rhs_info         = context.create_tensor_info(input2_info);
+
+    bool res = bool(GpuSub::validate_op(sketch, lhs_info, rhs_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionCLSubFixture = // One-op elementwise-binary fixture bound to CLTensor/CLAccessor/GpuSub
+    DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+template <typename T>
+using DynamicFusionCLSubBroadcastFixture = // Broadcast one-op fixture bound to CLTensor/CLAccessor/GpuSub
+    DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+template <typename T>
+using DynamicFusionCLSubTwoOpsFixture = // Two-ops fixture bound to CLTensor/CLAccessor/GpuSub
+    DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLSubFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+                       DynamicFusionCLSubFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::LargeShapes()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLSubBroadcastFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::TemporaryLimitedSmallShapesBroadcast()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the broadcast result (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+                       DynamicFusionCLSubBroadcastFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::TemporaryLimitedLargeShapesBroadcast()),
+                                       framework::dataset::make("DataType", {DataType::F32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the broadcast result (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(
+    RunSmallTwoOps,
+    DynamicFusionCLSubTwoOpsFixture<float>,
+    framework::DatasetMode::PRECOMMIT,
+    combine(combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                    datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+                            framework::dataset::make("DataType", {DataType::F32})),
+                    framework::dataset::make("InPlace", {false})),
+            framework::dataset::make("FuseTwoOps", {true})))
+{
+    // Compare the two-ops result (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLSubFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the F16 target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLSubBroadcastFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::TemporaryLimitedSmallShapesBroadcast()),
+                                       framework::dataset::make("DataType", {DataType::F16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the F16 broadcast result (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLSubFixture<int32_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::S32})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the S32 target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLSubFixture<int16_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::S16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the S16 target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionCLSubFixture<int16_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::LargeShapes()),
+                                       framework::dataset::make("DataType", {DataType::S16})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the S16 target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLSubFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", {DataType::U8})),
+                               framework::dataset::make("InPlace", {false})))
+{
+    // Compare the U8 target (read through CLAccessor) with the reference; no tolerance argument is passed
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE_END() // SUB
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp b/tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp
new file mode 100644
index 0000000000..2560f3aab1
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f32(0.00001f); // Absolute tolerance passed to validate() by the FP32 cases
+constexpr AbsoluteTolerance<float> tolerance_f16(0.001f); // Absolute tolerance passed to validate() by the FP16 cases
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(TANH)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type
+                                                    }),
+                framework::dataset::make("Expected", { true, true, false })),
+                input_info, expected)
+{
+    // Create a new workload sketch
+    CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Only run GpuTanh::validate_op for this input info and check it against the expected verdict
+    const ITensorInfo* src_info = context.create_tensor_info(input_info);
+
+    const bool res = static_cast<bool>(GpuTanh::validate_op(sketch, src_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T> // Alias binding the Tanh validation fixture to CLTensor/CLAccessor/GpuTanh
+using DynamicFusionTanhOpFixture = DynamicFusionTanhValidationFixture<CLTensor, CLAccessor, GpuTanh, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionTanhOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Compare the target (read through CLAccessor) with the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+                       DynamicFusionTanhOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // No output comparison here: the case only emits the info message below
+    ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+    framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionTanhOpFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Compare the Fuse=true result (read through CLAccessor) with the reference within tolerance_f16
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionTanhOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the target (read through CLAccessor) with the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+                       DynamicFusionTanhOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // No output comparison here: the case only emits the info message below
+    ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+    framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionTanhOpFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the Fuse=true result (read through CLAccessor) with the reference within tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // TANH
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/fixtures/ActivationLayerFixture.h b/tests/validation/fixtures/ActivationLayerFixture.h
index 91b43f0f24..a24ba8913e 100644
--- a/tests/validation/fixtures/ActivationLayerFixture.h
+++ b/tests/validation/fixtures/ActivationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_ACTIVATION_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_ACTIVATION_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_ACTIVATIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_ACTIVATIONLAYERFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -47,12 +47,7 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ActivationValidationGenericFixture : public framework::Fixture
 {
 public:
-    ActivationValidationGenericFixture()
-        : _target(parameters->get_ctx<TensorType>())
-    {
-    }
 
-    template <typename...>
     void setup(TensorShape shape, bool in_place, ActivationLayerInfo::ActivationFunction function, float alpha_beta, DataType data_type, QuantizationInfo quantization_info)
     {
         ActivationLayerInfo info(function, alpha_beta, alpha_beta);
@@ -120,29 +115,28 @@ protected:
 
     TensorType compute_target(const TensorShape &shape, ActivationLayerInfo info)
     {
-        auto ctx = parameters->get_ctx<TensorType>();
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, _input_quantization_info, DataLayout::NCHW, ctx);
-        TensorType dst = create_tensor<TensorType>(shape, _data_type, 1, _output_quantization_info, DataLayout::NCHW, ctx);
+        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, _input_quantization_info, DataLayout::NCHW);
+        TensorType dst = create_tensor<TensorType>(shape, _data_type, 1, _output_quantization_info, DataLayout::NCHW);
 
         // Create and configure function
-        FunctionType act_layer(ctx);
+        FunctionType act_layer;
 
         TensorType *dst_ptr = _in_place ? nullptr : &dst;
 
         act_layer.configure(&src, dst_ptr, info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
 
         if(!_in_place)
         {
             dst.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
         }
 
         // Fill tensors
@@ -234,7 +228,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ActivationValidationFixture : public ActivationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, bool in_place, ActivationLayerInfo::ActivationFunction function, float alpha_beta, DataType data_type)
     {
         ActivationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, in_place, function, alpha_beta, data_type, QuantizationInfo());
@@ -245,7 +238,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ActivationValidationQuantizedFixture : public ActivationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, bool in_place, ActivationLayerInfo::ActivationFunction function, float alpha_beta, DataType data_type, QuantizationInfo quantization_info)
     {
         ActivationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, in_place, function, alpha_beta, data_type, quantization_info);
@@ -255,4 +247,4 @@ public:
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ACTIVATION_LAYER_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_ACTIVATIONLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/AddMulAddFixture.h b/tests/validation/fixtures/AddMulAddFixture.h
new file mode 100644
index 0000000000..d13fef2f02
--- /dev/null
+++ b/tests/validation/fixtures/AddMulAddFixture.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE_H
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
+#include "tests/validation/reference/DequantizationLayer.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+#include "tests/validation/reference/QuantizationLayer.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class AddMulAddGenericFixture : public framework::Fixture // Runs the function under test only; no reference is computed in this class
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, ActivationLayerInfo &act_info, bool interm_out)
+    {
+        compute_target(shape, data_type, act_info, interm_out); // Results land in _target / _interm_target
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, DataType data_type) // Uniform fill; `i` is forwarded unchanged to the library call
+    {
+        switch(data_type)
+        {
+            case DataType::F32: // Bounds -10 and 10 are passed to the uniform fill
+                library->fill_tensor_uniform(tensor, i, -10.f, 10.f);
+                break;
+            case DataType::F16: // Bounds -1 and 1 are passed to the uniform fill
+                library->fill_tensor_uniform(tensor, i, -1.f, 1.f);
+                break;
+            default: // No explicit bounds are passed for any other data type
+                library->fill_tensor_uniform(tensor, i);
+                break;
+        }
+    }
+
+    void compute_target(const TensorShape &shape, DataType data_type, ActivationLayerInfo &act_info, bool interm_out)
+    {
+        TensorShape b_shape(shape.x()); // 1D shape of length shape.x(), used for bn_mul and bn_add
+
+        // Create tensors: inputs and outputs use `shape`, bn_mul/bn_add use `b_shape`
+        TensorType input1       = create_tensor<TensorType>(shape, data_type, 1, _input1_qinfo);
+        TensorType input2       = create_tensor<TensorType>(shape, data_type, 1, _input2_qinfo);
+        TensorType bn_mul       = create_tensor<TensorType>(b_shape, data_type, 1, _bn_mul_qinfo);
+        TensorType bn_add       = create_tensor<TensorType>(b_shape, data_type, 1, _bn_add_qinfo);
+        TensorType add_output   = create_tensor<TensorType>(shape, data_type, 1, _add_output_qinfo);
+        TensorType final_output = create_tensor<TensorType>(shape, data_type, 1, _final_output_qinfo);
+
+        // Check the arguments with validate() first, then configure; add_output is nullptr when interm_out is false
+        FunctionType add_mul_add;
+        ARM_COMPUTE_ERROR_THROW_ON(add_mul_add.validate(input1.info(), input2.info(), bn_mul.info(),
+                                                        bn_add.info(), interm_out ? add_output.info() : nullptr, final_output.info(),
+                                                        ConvertPolicy::SATURATE, act_info));
+
+        add_mul_add.configure(&input1, &input2, &bn_mul, &bn_add, interm_out ? &add_output : nullptr,
+                              &final_output, ConvertPolicy::SATURATE, act_info);
+
+        // Allocate tensors (the intermediate output only when it was requested)
+        input1.allocator()->allocate();
+        input2.allocator()->allocate();
+        bn_mul.allocator()->allocate();
+        bn_add.allocator()->allocate();
+
+        if(interm_out)
+        {
+            add_output.allocator()->allocate();
+        }
+
+        final_output.allocator()->allocate();
+
+        // Fill the four inputs, each with its own index (0-3)
+        fill(AccessorType(input1), 0, data_type);
+        fill(AccessorType(input2), 1, data_type);
+        fill(AccessorType(bn_mul), 2, data_type);
+        fill(AccessorType(bn_add), 3, data_type);
+
+        // Compute function
+        add_mul_add.run();
+
+        _target = std::move(final_output);
+
+        if(interm_out)
+        {
+            _interm_target = std::move(add_output);
+        }
+    }
+
+    TensorType      _target{}; // Final output, set by compute_target
+    TensorType      _interm_target{}; // Intermediate add output, set only when interm_out is true
+    SimpleTensor<T> _reference{}; // Not assigned in this class
+    SimpleTensor<T> _interm_reference{}; // Not assigned in this class
+
+    QuantizationInfo _input1_qinfo{}; // Quantization infos are default-constructed here and passed to create_tensor
+    QuantizationInfo _input2_qinfo{};
+    QuantizationInfo _bn_mul_qinfo{};
+    QuantizationInfo _bn_add_qinfo{};
+    QuantizationInfo _add_output_qinfo{};
+    QuantizationInfo _final_output_qinfo{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool interm_out>
+class AddMulAddFloatValidationFixture : public AddMulAddGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    using Parent = AddMulAddGenericFixture<TensorType, AccessorType, FunctionType, T>;
+
+    void setup(const TensorShape &shape, DataType data_type, ActivationLayerInfo act_info)
+    {
+        Parent::setup(shape, data_type, act_info, interm_out); // Runs the function under test
+        compute_reference(shape, data_type, act_info); // Fills Parent::_reference (and _interm_reference if interm_out)
+    }
+
+    // compute_reference lives outside the generic fixture because, for quantized data types, the reference
+    // becomes a very different implementation whose intermediate tensors are always float.
+    // Keeping it here makes the reference calculations more readable and keeps the classes smaller,
+    // since fill() and compute_target() are defined once in the generic fixture instead of being repeated.
+    void compute_reference(const TensorShape &shape, DataType data_type, ActivationLayerInfo &act_info)
+    {
+        TensorShape b_shape(shape.x()); // 1D shape of length shape.x(), used for bn_mul and bn_add
+
+        // Create reference tensors
+        SimpleTensor<T> input1{ shape, data_type };
+        SimpleTensor<T> input2{ shape, data_type };
+        SimpleTensor<T> bn_mul{ b_shape, data_type };
+        SimpleTensor<T> bn_add{ b_shape, data_type };
+        SimpleTensor<T> add_output{ shape, data_type, 1 };
+
+        SimpleTensor<T> bn_mul_out{ shape, data_type };
+        SimpleTensor<T> bn_add_out{ shape, data_type };
+
+        // Fill reference inputs with the same indices (0-3) that Parent::compute_target uses
+        Parent::fill(input1, 0, data_type);
+        Parent::fill(input2, 1, data_type);
+        Parent::fill(bn_mul, 2, data_type);
+        Parent::fill(bn_add, 3, data_type);
+        // Reference pipeline: add the inputs, multiply by bn_mul (scale 1), then add bn_add
+        reference::arithmetic_operation<T>(reference::ArithmeticOperation::ADD, input1, input2, add_output, ConvertPolicy::SATURATE);
+        bn_mul_out = reference::pixel_wise_multiplication<T, T, T>(add_output, bn_mul, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP, data_type);
+        reference::arithmetic_operation<T>(reference::ArithmeticOperation::ADD, bn_mul_out, bn_add, bn_add_out, ConvertPolicy::SATURATE);
+
+        if(interm_out) // add_output is no longer read after this point, so it can be moved out
+        {
+            Parent::_interm_reference = std::move(add_output);
+        }
+
+        if(act_info.enabled() && act_info.activation() != ActivationLayerInfo::ActivationFunction::IDENTITY)
+        {
+            Parent::_reference = reference::activation_layer(bn_add_out, act_info);
+        }
+        else // Activation disabled or IDENTITY: the second add result is the reference as-is
+        {
+            Parent::_reference = std::move(bn_add_out);
+        }
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool interm_out>
+class AddMulAddQuantizedValidationFixture : public AddMulAddGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    using Parent = AddMulAddGenericFixture<TensorType, AccessorType, FunctionType, T>;
+
+    void setup(const TensorShape &shape, DataType data_type, ActivationLayerInfo act_info,
+               QuantizationInfo input1_qinfo, QuantizationInfo input2_qinfo, QuantizationInfo bn_mul_qinfo,
+               QuantizationInfo bn_add_qinfo, QuantizationInfo add_output_qinfo, QuantizationInfo final_output_qinfo)
+    {
+        // Quantization arguments are stored as class attributes to avoid long function declarations
+        Parent::_input1_qinfo       = input1_qinfo;
+        Parent::_input2_qinfo       = input2_qinfo;
+        Parent::_bn_mul_qinfo       = bn_mul_qinfo;
+        Parent::_bn_add_qinfo       = bn_add_qinfo;
+        Parent::_add_output_qinfo   = add_output_qinfo;
+        Parent::_final_output_qinfo = final_output_qinfo;
+
+        Parent::setup(shape, data_type, act_info, interm_out);
+        compute_reference(shape, data_type, act_info);
+    }
+
+    // Reference for the quantized path: the inputs are dequantized to float, (input1 + input2) * bn_mul + bn_add
+    // is evaluated in F32, and the results are quantized back with the add-output / final-output quantization info.
+    // compute_reference() lives in this derived fixture rather than in the generic one because it differs so much
+    // from the float variant; fill() and the target computation are shared through Parent.
+    void compute_reference(const TensorShape &shape, DataType data_type, ActivationLayerInfo &act_info)
+    {
+        TensorShape b_shape(shape.x()); // Shape of bn_mul / bn_add: built from the x extent of the input shape only
+
+        // Create quantized reference inputs, each carrying its own quantization info
+        SimpleTensor<T> input1{ shape, data_type, 1, Parent::_input1_qinfo };
+        SimpleTensor<T> input2{ shape, data_type, 1, Parent::_input2_qinfo };
+        SimpleTensor<T> bn_mul{ b_shape, data_type, 1, Parent::_bn_mul_qinfo };
+        SimpleTensor<T> bn_add{ b_shape, data_type, 1, Parent::_bn_add_qinfo };
+
+        // Fill input tensors, passing the same indices 0..3 that are used for the target tensors
+        Parent::fill(input1, 0, data_type);
+        Parent::fill(input2, 1, data_type);
+        Parent::fill(bn_mul, 2, data_type);
+        Parent::fill(bn_add, 3, data_type);
+
+        SimpleTensor<float> input1_dequantized = reference::dequantization_layer<float>(input1);
+        SimpleTensor<float> input2_dequantized = reference::dequantization_layer<float>(input2);
+        SimpleTensor<float> bn_mul_dequantized = reference::dequantization_layer<float>(bn_mul);
+        SimpleTensor<float> bn_add_dequantized = reference::dequantization_layer<float>(bn_add);
+
+        SimpleTensor<float> add_output_dequantized{ shape, DataType::F32 };
+        SimpleTensor<float> bn_add_out_dequantized{ shape, DataType::F32 };
+
+        reference::arithmetic_operation<float>(reference::ArithmeticOperation::ADD, input1_dequantized, input2_dequantized, add_output_dequantized, ConvertPolicy::SATURATE);
+        SimpleTensor<float> bn_mul_out_dequantized = reference::pixel_wise_multiplication<float, float, float>(add_output_dequantized, bn_mul_dequantized, 1.f, ConvertPolicy::SATURATE,
+                                                                                                               RoundingPolicy::TO_NEAREST_UP, DataType::F32);
+        reference::arithmetic_operation<float>(reference::ArithmeticOperation::ADD, bn_mul_out_dequantized, bn_add_dequantized, bn_add_out_dequantized, ConvertPolicy::SATURATE);
+
+        if(interm_out) // Expose the (re-quantized) first addition's result only when the intermediate output is requested
+        {
+            Parent::_interm_reference = reference::quantization_layer<float, T>(add_output_dequantized, data_type, Parent::_add_output_qinfo);
+        }
+
+        if(act_info.enabled() && act_info.activation() != ActivationLayerInfo::ActivationFunction::IDENTITY)
+        {
+            SimpleTensor<T> ref = reference::quantization_layer<float, T>(bn_add_out_dequantized, data_type, Parent::_final_output_qinfo);
+            Parent::_reference  = reference::activation_layer(ref, act_info); // Activation is applied to the already re-quantized tensor
+        }
+        else // Activation disabled or IDENTITY: the re-quantized second addition's output is the reference
+        {
+            Parent::_reference = reference::quantization_layer<float, T>(bn_add_out_dequantized, data_type, Parent::_final_output_qinfo);
+        }
+    }
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_ADDMULADDFIXTURE_H
diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h b/tests/validation/fixtures/ArgMinMaxFixture.h
index f140c9846b..7a823568a8 100644
--- a/tests/validation/fixtures/ArgMinMaxFixture.h
+++ b/tests/validation/fixtures/ArgMinMaxFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,15 +42,14 @@ namespace test
 {
 namespace validation
 {
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
 class ArgMinMaxValidationBaseFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info)
+    void setup(TensorShape shape, DataType input_type, DataType output_type, int axis, ReductionOperation op, QuantizationInfo q_info)
     {
-        _target    = compute_target(shape, data_type, axis, op, q_info);
-        _reference = compute_reference(shape, data_type, axis, op, q_info);
+        _target    = compute_target(shape, input_type, output_type, axis, op, q_info);
+        _reference = compute_reference(shape, input_type, output_type, axis, op, q_info);
     }
 
 protected:
@@ -80,7 +79,7 @@ protected:
             case DataType::QASYMM8:
             {
                 std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
+                std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
 
                 library->fill(tensor, distribution, 0);
                 break;
@@ -88,7 +87,7 @@ protected:
             case DataType::QASYMM8_SIGNED:
             {
                 std::pair<int, int> bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<int8_t> distribution(bounds.first, bounds.second);
+                std::uniform_int_distribution<int32_t> distribution(bounds.first, bounds.second);
 
                 library->fill(tensor, distribution, 0);
                 break;
@@ -98,25 +97,25 @@ protected:
         }
     }
 
-    TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info)
+    TensorType compute_target(TensorShape &src_shape, DataType input_type, DataType output_type, int axis, ReductionOperation op, QuantizationInfo q_info)
     {
         // Create tensors
-        TensorType src = create_tensor<TensorType>(src_shape, data_type, 1, q_info);
-        TensorType dst;
+        TensorType src = create_tensor<TensorType>(src_shape, input_type, 1, q_info);
+        TensorType dst = create_tensor<TensorType>(compute_output_shape(src_shape, axis), output_type, 1, q_info);
 
         // Create and configure function
         FunctionType arg_min_max_layer;
         arg_min_max_layer.configure(&src, axis, &dst, op);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -127,41 +126,43 @@ protected:
         return dst;
     }
 
-    SimpleTensor<int32_t> compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info)
+    TensorShape compute_output_shape(const TensorShape &src_shape, int axis)
+    {
+        return arm_compute::misc::shape_calculator::compute_reduced_shape(src_shape, axis, false);
+    }
+
+    SimpleTensor<T2> compute_reference(TensorShape &src_shape, DataType input_type, DataType output_type, int axis, ReductionOperation op, QuantizationInfo q_info)
     {
         // Create reference
-        SimpleTensor<T> src{ src_shape, data_type, 1, q_info };
+        SimpleTensor<T1> src{ src_shape, input_type, 1, q_info };
 
         // Fill reference
         fill(src);
 
-        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(src_shape, axis, false);
-        return reference::reduction_operation<T, int32_t>(src, output_shape, axis, op);
+        return reference::reduction_operation<T1, T2>(src, compute_output_shape(src_shape, axis), axis, op, output_type);
     }
 
-    TensorType            _target{};
-    SimpleTensor<int32_t> _reference{};
+    TensorType       _target{};
+    SimpleTensor<T2> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ArgMinMaxValidationQuantizedFixture : public ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
+class ArgMinMaxValidationQuantizedFixture : public ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo quantization_info)
+    void setup(const TensorShape &shape, DataType input_type, DataType output_type, int axis, ReductionOperation op, QuantizationInfo quantization_info)
     {
-        ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, quantization_info);
+        ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, input_type, output_type, axis, op, quantization_info);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ArgMinMaxValidationFixture : public ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
+class ArgMinMaxValidationFixture : public ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op)
+    void setup(const TensorShape &shape, DataType input_type, DataType output_type, int axis, ReductionOperation op)
     {
-        ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, QuantizationInfo());
+        ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, input_type, output_type, axis, op, QuantizationInfo());
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/ArithmeticDivisionFixture.h b/tests/validation/fixtures/ArithmeticDivisionFixture.h
index 782939a960..e11a386130 100644
--- a/tests/validation/fixtures/ArithmeticDivisionFixture.h
+++ b/tests/validation/fixtures/ArithmeticDivisionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticDivisionBroadcastValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type)
     {
         _target    = compute_target(shape0, shape1, data_type);
@@ -73,18 +72,18 @@ protected:
         FunctionType add;
         add.configure(&ref_src1, &ref_src2, &dst);
 
-        ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(ref_src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         ref_src1.allocator()->allocate();
         ref_src2.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!ref_src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(ref_src1), 0);
@@ -117,7 +116,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticDivisionValidationFixture : public ArithmeticDivisionBroadcastValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
         ArithmeticDivisionBroadcastValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, shape, data_type);
diff --git a/tests/validation/fixtures/ArithmeticOperationsFixture.h b/tests/validation/fixtures/ArithmeticOperationsFixture.h
index 9ba7bd3ef7..0785af1151 100644
--- a/tests/validation/fixtures/ArithmeticOperationsFixture.h
+++ b/tests/validation/fixtures/ArithmeticOperationsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,16 +45,14 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticOperationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(reference::ArithmeticOperation op, const TensorShape &shape0, const TensorShape &shape1,
-               DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info, bool in_place)
+    void setup(reference::ArithmeticOperation op, const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy,
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info, bool is_inplace)
     {
-        _op        = op;
-        _act_info  = act_info;
-        _in_place  = in_place;
-        _target    = compute_target(shape0, shape1, data_type0, data_type1, output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out);
-        _reference = compute_reference(shape0, shape1, data_type0, data_type1, output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out);
+        _op         = op;
+        _act_info   = act_info;
+        _is_inplace = is_inplace;
+        _target     = compute_target(shape0, shape1, data_type, convert_policy, qinfo0, qinfo1, qinfo_out);
+        _reference  = compute_reference(shape0, shape1, data_type, convert_policy, qinfo0, qinfo1, qinfo_out);
     }
 
 protected:
@@ -64,31 +62,55 @@ protected:
         library->fill_tensor_uniform(tensor, i);
     }
 
-    TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
+    TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy,
                               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
     {
         // Create tensors
-        TensorType  ref_src1   = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
-        TensorType  ref_src2   = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
-        TensorType  dst        = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out);
-        TensorType *dst_to_use = _in_place ? &ref_src1 : &dst;
+        const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
+        TensorType        ref_src1  = create_tensor<TensorType>(shape0, data_type, 1, qinfo0);
+        TensorType        ref_src2  = create_tensor<TensorType>(shape1, data_type, 1, qinfo1);
+        TensorType        dst       = create_tensor<TensorType>(out_shape, data_type, 1, qinfo_out);
+
+        // For in-place runs, use as destination the input whose shape and quantization info match the output
+        TensorType *actual_dst = &dst;
+        if(_is_inplace)
+        {
+            bool src1_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out);
+            bool src2_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out);
+            bool do_in_place     = out_shape.total_size() != 0 && (src1_is_inplace || src2_is_inplace);
+            ARM_COMPUTE_ASSERT(do_in_place);
+
+            if(src1_is_inplace)
+            {
+                actual_dst = &ref_src1;
+            }
+            else
+            {
+                actual_dst = &ref_src2;
+            }
+        }
 
         // Create and configure function
         FunctionType arith_op;
-        arith_op.configure(&ref_src1, &ref_src2, dst_to_use, convert_policy, _act_info);
+        arith_op.configure(&ref_src1, &ref_src2, actual_dst, convert_policy, _act_info);
 
-        ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_to_use->info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(ref_src2.info()->is_resizable());
 
         // Allocate tensors
         ref_src1.allocator()->allocate();
         ref_src2.allocator()->allocate();
-        dst_to_use->allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_to_use->info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!ref_src2.info()->is_resizable());
+
+        // When not computing in place, the separate dst tensor still has to be allocated
+        if(!_is_inplace)
+        {
+            ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+            dst.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        }
 
         // Fill tensors
         fill(AccessorType(ref_src1), 0);
@@ -97,50 +119,40 @@ protected:
         // Compute function
         arith_op.run();
 
-        if(_in_place)
-        {
-            return ref_src1;
-        }
-        return dst;
+        return std::move(*actual_dst);
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1,
-                                      DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
+    SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy,
                                       QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
     {
-        // current in-place implementation only supports same metadata of input and output tensors.
-        // By ignoring output quantization information here, we can make test cases implementation much simpler.
-        QuantizationInfo output_qinfo = _in_place ? qinfo0 : qinfo_out;
-
         // Create reference
-        SimpleTensor<T> ref_src1{ shape0, data_type0, 1, qinfo0 };
-        SimpleTensor<T> ref_src2{ shape1, data_type1, 1, qinfo1 };
-        SimpleTensor<T> ref_dst{ TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, output_qinfo };
+        SimpleTensor<T> ref_src1{ shape0, data_type, 1, qinfo0 };
+        SimpleTensor<T> ref_src2{ shape1, data_type, 1, qinfo1 };
+        SimpleTensor<T> ref_dst{ TensorShape::broadcast_shape(shape0, shape1), data_type, 1, qinfo_out };
 
         // Fill reference
         fill(ref_src1, 0);
         fill(ref_src2, 1);
 
         auto result = reference::arithmetic_operation<T>(_op, ref_src1, ref_src2, ref_dst, convert_policy);
-        return _act_info.enabled() ? reference::activation_layer(result, _act_info, output_qinfo) : result;
+        return _act_info.enabled() ? reference::activation_layer(result, _act_info, qinfo_out) : result;
     }
 
     TensorType                     _target{};
     SimpleTensor<T>                _reference{};
     reference::ArithmeticOperation _op{ reference::ArithmeticOperation::ADD };
     ActivationLayerInfo            _act_info{};
-    bool                           _in_place{};
+    bool                           _is_inplace{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class ArithmeticAdditionBroadcastValidationFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy, bool is_inplace)
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), false);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type, convert_policy,
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
     }
 };
 
@@ -148,11 +160,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticAdditionValidationFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy)
+    void setup(const TensorShape &shape, DataType data_type, ConvertPolicy convert_policy, bool is_inplace)
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), false);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type, convert_policy,
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
     }
 };
 
@@ -160,11 +171,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticAdditionBroadcastValidationFloatFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info, bool is_inplace)
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, false);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type, convert_policy,
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -172,11 +182,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticAdditionValidationFloatFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info, bool is_inplace)
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, false);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type, convert_policy,
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -184,13 +193,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticAdditionValidationQuantizedFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+    void setup(const TensorShape &shape, DataType data_type, ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), false);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type, convert_policy,
+                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), is_inplace);
     }
 };
 
@@ -198,13 +205,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticAdditionValidationQuantizedBroadcastFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
-               ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out,
+               bool is_inplace)
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1,
-                                                                                            data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), false);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type, convert_policy,
+                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), is_inplace);
     }
 };
 
@@ -212,12 +217,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticSubtractionBroadcastValidationFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, bool in_place)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy, bool is_inplace) // thin wrapper: every argument is forwarded to the generic fixture
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
-                                                                                            data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), in_place);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1, data_type, convert_policy, // SUB with two independent shapes
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace); // default-constructed quantization infos and ActivationLayerInfo
     }
 };
 
@@ -225,13 +228,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticSubtractionBroadcastValidationFloatFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info,
-               bool in_place)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info,
+               bool is_inplace) // is_inplace is forwarded as the final argument of the generic setup call below
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
-                                                                                            data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, in_place);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1, data_type, convert_policy, // SUB with two independent shapes
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace); // default-constructed quantization infos; caller-supplied act_info
     }
 };
 
@@ -239,12 +240,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticSubtractionValidationFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, bool in_place)
+    void setup(const TensorShape &shape, DataType data_type, ConvertPolicy convert_policy, bool is_inplace) // thin wrapper: every argument is forwarded to the generic fixture
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape,
-                                                                                            data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), in_place);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape, data_type, convert_policy, // SUB; the single shape is passed for both shape arguments
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace); // default-constructed quantization infos and ActivationLayerInfo
     }
 };
 
@@ -252,12 +251,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticSubtractionValidationFloatFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info, bool in_place)
+    void setup(const TensorShape &shape, DataType data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info, bool is_inplace) // thin wrapper: every argument is forwarded to the generic fixture
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape,
-                                                                                            data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, in_place);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape, data_type, convert_policy, // SUB; the single shape is passed for both shape arguments
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace); // default-constructed quantization infos; caller-supplied act_info
     }
 };
 
@@ -265,14 +262,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticSubtractionValidationQuantizedFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool in_place)
+    void setup(const TensorShape &shape, DataType data_type, ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace) // thin wrapper: every argument is forwarded to the generic fixture
 
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape,
-                                                                                            data_type0, data_type1, output_data_type,
-                                                                                            convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), in_place);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape, data_type, convert_policy, // SUB; the single shape is passed for both shape arguments
+                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), is_inplace); // caller's quantization infos; default-constructed ActivationLayerInfo
     }
 };
 
@@ -280,13 +274,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticSubtractionValidationQuantizedBroadcastFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
-               ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool in_place)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out,
+               bool is_inplace) // is_inplace is forwarded as the final argument of the generic setup call below
     {
-        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
-                                                                                            data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), in_place);
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1, data_type, convert_policy, // SUB with two independent shapes
+                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), is_inplace); // caller's quantization infos; default-constructed ActivationLayerInfo
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/BatchNormalizationLayerFixture.h b/tests/validation/fixtures/BatchNormalizationLayerFixture.h
index f5a5420df3..54a0ed9e09 100644
--- a/tests/validation/fixtures/BatchNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/BatchNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BatchNormalizationLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape0, TensorShape shape1, float epsilon, bool use_beta, bool use_gamma, ActivationLayerInfo act_info, DataType dt, DataLayout data_layout)
     {
         _data_type = dt;
@@ -111,12 +110,12 @@ protected:
         TensorType *gamma_ptr = _use_gamma ? &gamma : nullptr;
         norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon, act_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(var.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(beta.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(gamma.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(var.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(beta.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(gamma.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -126,12 +125,12 @@ protected:
         beta.allocator()->allocate();
         gamma.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!var.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!beta.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!gamma.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!var.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!beta.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!gamma.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), AccessorType(mean), AccessorType(var), AccessorType(beta), AccessorType(gamma));
diff --git a/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h b/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h
index ccacfeb062..161eeb0ef4 100644
--- a/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h
+++ b/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename ConvolutionFuncti
 class BatchNormalizationLayerFusionValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape src_shape, TensorShape w_shape, TensorShape b_shape, TensorShape dst_shape, PadStrideInfo info, Size2D dilation,
                bool use_conv_b, bool use_beta, bool use_gamma, float epsilon, DataType dt, DataLayout data_layout)
     {
@@ -110,16 +109,16 @@ protected:
         fuse_fn.configure(&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b, conv_b_ptr, beta_ptr, gamma_ptr, epsilon);
         conv_fn.configure(&src, &fused_w, &fused_b, &dst, info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(conv_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(conv_b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bn_mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bn_var.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bn_beta.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bn_gamma.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(fused_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(fused_b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(conv_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(conv_b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bn_mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bn_var.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bn_beta.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bn_gamma.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(fused_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(fused_b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -133,16 +132,16 @@ protected:
         fused_b.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!conv_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!conv_b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bn_mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bn_var.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bn_beta.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bn_gamma.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!fused_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!fused_b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!conv_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!conv_b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bn_mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bn_var.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bn_beta.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bn_gamma.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!fused_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!fused_b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src),
diff --git a/tests/validation/fixtures/BatchToSpaceLayerFixture.h b/tests/validation/fixtures/BatchToSpaceLayerFixture.h
index 796afd1c5d..56a6109dbc 100644
--- a/tests/validation/fixtures/BatchToSpaceLayerFixture.h
+++ b/tests/validation/fixtures/BatchToSpaceLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_FIXTURE
 #define ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_FIXTURE
 
+#include "arm_compute/core/Helpers.h"
 #include "tests/Globals.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
@@ -39,11 +40,10 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BatchToSpaceLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape input_shape, TensorShape block_shape_shape, TensorShape output_shape, DataType data_type, DataLayout data_layout)
+    void setup(const TensorShape &input_shape, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &output_shape, DataType data_type, DataLayout data_layout) // stores both results in _target / _reference
     {
-        _target    = compute_target(input_shape, block_shape_shape, output_shape, data_type, data_layout);
-        _reference = compute_reference(input_shape, block_shape_shape, output_shape, data_type);
+        _target    = compute_target(input_shape, block_shape, crop_info, output_shape, data_type, data_layout); // output of the FunctionType under test
+        _reference = compute_reference(input_shape, block_shape, crop_info, output_shape, data_type); // output of reference::batch_to_space; data_layout is not passed on this path
     }
 
 protected:
@@ -56,9 +56,10 @@ protected:
         DistributionType distribution{ T(-1.0f), T(1.0f) };
         library->fill(tensor, distribution, i);
     }
-    TensorType compute_target(TensorShape input_shape, TensorShape block_shape_shape, TensorShape output_shape,
+    TensorType compute_target(TensorShape input_shape, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, TensorShape output_shape,
                               DataType data_type, DataLayout data_layout)
     {
+        ARM_COMPUTE_ERROR_ON(block_shape.size() != 2U); // Only support batch to 2D space (x, y) for now
         if(data_layout == DataLayout::NHWC)
         {
             permute(input_shape, PermutationVector(2U, 0U, 1U));
@@ -66,64 +67,49 @@ protected:
         }
 
         // Create tensors
-        TensorType input       = create_tensor<TensorType>(input_shape, data_type, 1, QuantizationInfo(), data_layout);
-        TensorType block_shape = create_tensor<TensorType>(block_shape_shape, DataType::S32);
-        TensorType output      = create_tensor<TensorType>(output_shape, data_type, 1, QuantizationInfo(), data_layout);
+        TensorType input  = create_tensor<TensorType>(input_shape, data_type, 1, QuantizationInfo(), data_layout);
+        TensorType output = create_tensor<TensorType>(output_shape, data_type, 1, QuantizationInfo(), data_layout);
 
         // Create and configure function
         FunctionType batch_to_space;
-        batch_to_space.configure(&input, &block_shape, &output);
+        batch_to_space.configure(&input, block_shape.at(0), block_shape.at(1), &output, crop_info);
 
-        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(block_shape.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
         // Allocate tensors
         input.allocator()->allocate();
-        block_shape.allocator()->allocate();
         output.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!block_shape.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(input), 0);
-        {
-            auto      block_shape_data = AccessorType(block_shape);
-            const int idx_width        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-            for(unsigned int i = 0; i < block_shape_shape.x(); ++i)
-            {
-                static_cast<int32_t *>(block_shape_data.data())[i] = output_shape[i + idx_width] / input_shape[i + idx_width];
-            }
-        }
         // Compute function
         batch_to_space.run();
 
         return output;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &block_shape_shape,
-                                      const TensorShape &output_shape, DataType data_type)
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const std::vector<int32_t> &block_shape, // block_shape must hold exactly two entries (checked below)
+                                      const CropInfo &crop_info, const TensorShape &output_shape, DataType data_type) // crop_info and output_shape are handed to the reference call as given
     {
+        ARM_COMPUTE_ERROR_ON(block_shape.size() != 2U); // Only support batch to 2D space (x, y) for now
         // Create reference
-        SimpleTensor<T>       input{ input_shape, data_type };
-        SimpleTensor<int32_t> block_shape{ block_shape_shape, DataType::S32 };
+        SimpleTensor<T> input{ input_shape, data_type }; // the input is the only tensor built here; block_shape is used directly as a vector
 
         // Fill reference
         fill(input, 0);
-        for(unsigned int i = 0; i < block_shape_shape.x(); ++i)
-        {
-            block_shape[i] = output_shape[i] / input_shape[i];
-        }
 
         // Compute reference
-        return reference::batch_to_space(input, block_shape, output_shape);
+        return reference::batch_to_space(input, block_shape, crop_info, output_shape); // result type is SimpleTensor<T>
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/BitwiseAndFixture.h b/tests/validation/fixtures/BitwiseAndFixture.h
index 6c8e1b1f6e..745a34058e 100644
--- a/tests/validation/fixtures/BitwiseAndFixture.h
+++ b/tests/validation/fixtures/BitwiseAndFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BitwiseAndValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -69,17 +68,17 @@ protected:
 
         bitwise_and.configure(&src1, &src2, &dst);
 
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src1.allocator()->allocate();
         src2.allocator()->allocate();
         dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src1), 0);
diff --git a/tests/validation/fixtures/BitwiseNotFixture.h b/tests/validation/fixtures/BitwiseNotFixture.h
index c6affcfad6..bdfd255156 100644
--- a/tests/validation/fixtures/BitwiseNotFixture.h
+++ b/tests/validation/fixtures/BitwiseNotFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BitwiseNotValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -68,14 +67,14 @@ protected:
 
         bitwise_not.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/BitwiseOrFixture.h b/tests/validation/fixtures/BitwiseOrFixture.h
index a40f635b9e..03560e0171 100644
--- a/tests/validation/fixtures/BitwiseOrFixture.h
+++ b/tests/validation/fixtures/BitwiseOrFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BitwiseOrValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -69,17 +68,17 @@ protected:
 
         bitwise_or.configure(&src1, &src2, &dst);
 
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src1.allocator()->allocate();
         src2.allocator()->allocate();
         dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src1), 0);
diff --git a/tests/validation/fixtures/BitwiseXorFixture.h b/tests/validation/fixtures/BitwiseXorFixture.h
index c103033fe6..4872b231a5 100644
--- a/tests/validation/fixtures/BitwiseXorFixture.h
+++ b/tests/validation/fixtures/BitwiseXorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BitwiseXorValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -69,17 +68,17 @@ protected:
 
         bitwise_xor.configure(&src1, &src2, &dst);
 
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src1.allocator()->allocate();
         src2.allocator()->allocate();
         dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src1), 0);
diff --git a/tests/validation/fixtures/BoundingBoxTransformFixture.h b/tests/validation/fixtures/BoundingBoxTransformFixture.h
index 7155848b4d..03edaeab16 100644
--- a/tests/validation/fixtures/BoundingBoxTransformFixture.h
+++ b/tests/validation/fixtures/BoundingBoxTransformFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -102,7 +102,6 @@ class BoundingBoxTransformGenericFixture : public framework::Fixture
 public:
     using TDeltas = typename std::conditional<std::is_same<typename std::decay<T>::type, uint16_t>::value, uint8_t, T>::type;
 
-    template <typename...>
     void setup(TensorShape deltas_shape, const BoundingBoxTransformInfo &info, DataType data_type, QuantizationInfo deltas_qinfo)
     {
         const bool is_qasymm16 = data_type == DataType::QASYMM16;
@@ -157,17 +156,18 @@ protected:
         FunctionType bbox_transform;
         bbox_transform.configure(&boxes, &pred_boxes, &deltas, bbox_info);
 
-        ARM_COMPUTE_EXPECT(deltas.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(boxes.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(pred_boxes.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(deltas.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(boxes.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(pred_boxes.info()->is_resizable());
 
         // Allocate tensors
         deltas.allocator()->allocate();
         boxes.allocator()->allocate();
         pred_boxes.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!deltas.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!boxes.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!deltas.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!boxes.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!pred_boxes.info()->is_resizable()); // pred_boxes is allocated above as well, so it gets the same post-allocation check as deltas and boxes
 
         // Fill tensors
         TensorShape        img_shape(bbox_info.scale() * bbox_info.img_width(), bbox_info.scale() * bbox_info.img_height());
@@ -215,7 +215,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BoundingBoxTransformFixture : public BoundingBoxTransformGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape deltas_shape, const BoundingBoxTransformInfo &info, DataType data_type)
     {
         BoundingBoxTransformGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(deltas_shape, info, data_type, QuantizationInfo());
@@ -228,7 +227,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class BoundingBoxTransformQuantizedFixture : public BoundingBoxTransformGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape deltas_shape, const BoundingBoxTransformInfo &info, DataType data_type, QuantizationInfo deltas_qinfo)
     {
         BoundingBoxTransformGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(deltas_shape, info, data_type, deltas_qinfo);
diff --git a/tests/validation/fixtures/CastFixture.h b/tests/validation/fixtures/CastFixture.h
index c9764af37c..e9d624e6f3 100644
--- a/tests/validation/fixtures/CastFixture.h
+++ b/tests/validation/fixtures/CastFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class CastValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType dt_in, DataType dt_out, ConvertPolicy policy)
     {
         _target    = compute_target(shape, dt_in, dt_out, policy);
@@ -86,6 +85,16 @@ protected:
                     library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(signed_min), static_cast<int32_t>(signed_max));
                     break;
                 }
+                case DataType::U64:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<uint64_t>(unsigned_min), static_cast<uint64_t>(unsigned_max));
+                    break;
+                }
+                case DataType::S64:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<int64_t>(signed_min), static_cast<int64_t>(signed_max));
+                    break;
+                }
                 default:
                     ARM_COMPUTE_ERROR("NOT SUPPORTED!");
             }
@@ -106,15 +115,15 @@ protected:
         FunctionType cast;
         cast.configure(&src, &dst, policy);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0, dt_in, dt_out);
diff --git a/tests/validation/fixtures/ChannelShuffleLayerFixture.h b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
index de718fbbaa..530dba3893 100644
--- a/tests/validation/fixtures/ChannelShuffleLayerFixture.h
+++ b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ChannelShuffleLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, unsigned int num_groups, DataType data_type, DataLayout data_layout)
     {
         _target    = compute_target(shape, data_type, num_groups, data_layout);
@@ -75,15 +74,15 @@ protected:
         FunctionType channel_shuffle_func;
         channel_shuffle_func.configure(&src, &dst, num_groups);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/Col2ImFixture.h b/tests/validation/fixtures/Col2ImFixture.h
index f8673af38a..4d56d607b7 100644
--- a/tests/validation/fixtures/Col2ImFixture.h
+++ b/tests/validation/fixtures/Col2ImFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,10 +45,9 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool batch_size_on_z>
-class Col2ImValidationFixture : public framework::Fixture
+class Col2ImOpValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, const unsigned int convolved_width, unsigned int convolved_height, unsigned int num_groups, DataType data_type)
     {
         const Size2D convolved_dims(convolved_width, convolved_height);
@@ -74,23 +73,28 @@ protected:
 
         // Create and configure function
         FunctionType col2im_func;
-        col2im_func.configure(&src, &dst, convolved_dims, num_groups);
+        col2im_func.configure(src.info(), dst.info(), convolved_dims, num_groups);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
 
+        arm_compute::ITensorPack pack =
+        {
+            { arm_compute::TensorType::ACL_SRC, &src },
+            { arm_compute::TensorType::ACL_DST, &dst }
+        };
         // Compute function
-        col2im_func.run();
+        col2im_func.run(pack);
 
         return dst;
     }
diff --git a/tests/validation/fixtures/ComparisonFixture.h b/tests/validation/fixtures/ComparisonFixture.h
index 43da0ae49c..f25d5abb73 100644
--- a/tests/validation/fixtures/ComparisonFixture.h
+++ b/tests/validation/fixtures/ComparisonFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComparisonValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(ComparisonOperation op, const TensorShape &shape0, const TensorShape &shape1, DataType data_type, QuantizationInfo qinfo0, QuantizationInfo qinfo1)
     {
         _target    = compute_target(op, shape0, shape1, data_type, qinfo0, qinfo1);
@@ -71,18 +70,18 @@ protected:
         FunctionType comp_op;
         comp_op.configure(&ref_src1, &ref_src2, &dst, op);
 
-        ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(ref_src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         ref_src1.allocator()->allocate();
         ref_src2.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!ref_src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(ref_src1), 0);
@@ -117,7 +116,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComparisonBroadcastValidationFixture : public ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(ComparisonOperation op, const TensorShape &shape0, const TensorShape &shape1, DataType data_type)
     {
         ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, data_type, QuantizationInfo(), QuantizationInfo());
@@ -128,7 +126,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComparisonValidationFixture : public ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(ComparisonOperation op, const TensorShape &shape, DataType data_type)
     {
         ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape, shape, data_type, QuantizationInfo(), QuantizationInfo());
@@ -139,7 +136,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComparisonValidationQuantizedFixture : public ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(ComparisonOperation op, const TensorShape &shape, DataType data_type, QuantizationInfo qinfo0, QuantizationInfo qinfo1)
 
     {
@@ -151,7 +147,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComparisonQuantizedBroadcastValidationFixture : public ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(ComparisonOperation op, const TensorShape &shape0, const TensorShape &shape1, DataType data_type, QuantizationInfo qinfo0, QuantizationInfo qinfo1)
     {
         ComparisonValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, data_type, qinfo0, qinfo1);
diff --git a/tests/validation/fixtures/ComputeAllAnchorsFixture.h b/tests/validation/fixtures/ComputeAllAnchorsFixture.h
index f385cb81f0..620f1b53fa 100644
--- a/tests/validation/fixtures/ComputeAllAnchorsFixture.h
+++ b/tests/validation/fixtures/ComputeAllAnchorsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComputeAllAnchorsGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(size_t num_anchors, const ComputeAnchorsInfo &info, DataType data_type, QuantizationInfo qinfo)
     {
         _target    = compute_target(num_anchors, data_type, info, qinfo);
@@ -69,13 +68,13 @@ protected:
         FunctionType compute_all_anchors;
         compute_all_anchors.configure(&anchors, &all_anchors, info);
 
-        ARM_COMPUTE_EXPECT(all_anchors.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(all_anchors.info()->is_resizable());
 
         // Allocate tensors
         all_anchors.allocator()->allocate();
         anchors.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!all_anchors.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!all_anchors.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(anchors));
@@ -107,7 +106,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComputeAllAnchorsFixture : public ComputeAllAnchorsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(size_t num_anchors, const ComputeAnchorsInfo &info, DataType data_type)
     {
         ComputeAllAnchorsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(num_anchors, info, data_type, QuantizationInfo());
@@ -118,7 +116,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ComputeAllAnchorsQuantizedFixture : public ComputeAllAnchorsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(size_t num_anchors, const ComputeAnchorsInfo &info, DataType data_type, QuantizationInfo qinfo)
     {
         ComputeAllAnchorsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(num_anchors, info, data_type, qinfo);
diff --git a/tests/validation/fixtures/ConcatenateLayerFixture.h b/tests/validation/fixtures/ConcatenateLayerFixture.h
index d9615ff8b5..3a021661ac 100644
--- a/tests/validation/fixtures/ConcatenateLayerFixture.h
+++ b/tests/validation/fixtures/ConcatenateLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,6 @@ private:
     using SrcITensorType = typename std::conditional<CI, const ITensorType, ITensorType>::type;
 
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, unsigned int axis)
     {
         // Create input shapes
@@ -119,20 +118,20 @@ protected:
 
         for(auto &src : srcs)
         {
-            ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(src.info()->is_resizable());
         }
 
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         for(auto &src : srcs)
         {
             src.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
         }
 
         dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         int i = 0;
diff --git a/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h b/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
index d7984830f5..7ad14e1b40 100644
--- a/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
+++ b/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ConvertFullyConnectedWeightsValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, unsigned int weights_w, DataLayout training_data_layout, DataType data_type)
     {
         const unsigned int height = input_shape.x() * input_shape.y() * input_shape.z();
@@ -61,7 +60,7 @@ protected:
         {
             case DataType::QASYMM8:
             {
-                std::uniform_int_distribution<uint8_t> distribution(0, 10);
+                std::uniform_int_distribution<uint32_t> distribution(0, 10);
                 library->fill(tensor, distribution, i);
                 break;
             }
@@ -93,15 +92,15 @@ protected:
 
         convert_weights.configure(&src, &dst, input_shape, training_data_layout);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index a4db49fc8e..0622e5e6f0 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_CONVOLUTION_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_CONVOLUTION_LAYER_FIXTURE
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_CONVOLUTIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_CONVOLUTIONLAYERFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/graph/Utils.h"
+#ifdef ARM_COMPUTE_OPENCL_ENABLED
+#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
+#endif // ARM_COMPUTE_OPENCL_ENABLED
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/graph/mutators/MutatorUtils.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -35,10 +42,12 @@
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/ActivationLayer.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/PadLayer.h"
 #include "tests/validation/reference/Permute.h"
 #include "tests/validation/reference/Utils.h"
 
 #include <random>
+#include <type_traits>
 
 namespace arm_compute
 {
@@ -49,13 +58,30 @@ namespace validation
 namespace detail
 {
 template <typename ConvolutionFunction, typename TensorType>
-void configure_conv_function(ConvolutionFunction &func,
+#ifdef ARM_COMPUTE_OPENCL_ENABLED
+std::enable_if_t<!std::is_same<ConvolutionFunction, CLGEMMConvolutionLayer>::value, void>
+#else // ARM_COMPUTE_OPENCL_ENABLED
+void
+#endif // ARM_COMPUTE_OPENCL_ENABLED
+configure_conv_function(ConvolutionFunction &func,
+                             TensorType *src, const TensorType *weights, const TensorType *bias, TensorType *dst,
+                             const PadStrideInfo &info, const WeightsInfo &weights_info,
+                             const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    func.configure(src, weights, bias, dst, info, weights_info, dilation, act_info, false /* enable_fast_math */, num_groups);
+}
+
+#ifdef ARM_COMPUTE_OPENCL_ENABLED
+template <typename ConvolutionFunction, typename TensorType>
+std::enable_if_t<std::is_same<ConvolutionFunction, CLGEMMConvolutionLayer>::value, void>
+configure_conv_function(ConvolutionFunction &func,
                              TensorType *src, const TensorType *weights, const TensorType *bias, TensorType *dst,
                              const PadStrideInfo &info, const WeightsInfo &weights_info,
                              const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     func.configure(src, weights, bias, dst, info, weights_info, dilation, act_info, num_groups);
 }
+#endif // ARM_COMPUTE_OPENCL_ENABLED
 } // namespace detail
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
@@ -66,26 +92,84 @@ public:
                   || std::is_same<typename std::decay<T>::type, int8_t>::value,
                   int32_t, T >::type;
 
+    void setup_quantization(TensorShape input_shape, TensorShape weights_shape, QuantizationInfo &input_q_info,
+        QuantizationInfo &weights_q_info, DataType data_type)
+    {
+        const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+        const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+
+        std::mt19937                           generator(library->seed() + _hash);
+        std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        _quantization_info = QuantizationInfo(scale_lhs, offset_lhs);
+        _weight_quantization_info = QuantizationInfo(scale_rhs, offset_rhs);
+
+        QuantizationHint q_hint = suggest_conv_dst_q_info_and_bias(input_q_info, weights_q_info,
+            weights_shape.y() /* heights */, weights_shape.x() /* width */, input_shape.z() /* channels */,
+            data_type, 0.5f /* bias_fraction */);
+
+        _dst_q_info = q_hint.q_info;
+        _min_bias = q_hint.bias_min;
+        _max_bias = q_hint.bias_max;
+    }
+
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights,
-               DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info)
+               DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info,
+               bool mixed_layout = false, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false)
     {
+        // This hash is used by random generators. There may be hash collisions but
+        // this is intentional, as it's a very easy way to make the current
+        // random generation process differ across most test configurations,
+        // which were all using the same set of values before.
+        _hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] +
+            + weights_shape[0] + weights_shape[1] + weights_shape[2] + weights_shape[3] +
+            mixed_layout + (data_type == DataType::QASYMM8_SIGNED) + (data_layout == DataLayout::NHWC);
+
+        _mixed_layout             = mixed_layout;
         _data_type                = data_type;
         _weights_data_type        = weights_data_type;
-        _is_quantized             = is_data_type_quantized_asymmetric(data_type);
+        const bool is_quantized   = is_data_type_quantized(weights_data_type);
         _is_bfloat16              = data_type == DataType::BFLOAT16;
-        _bias_data_type           = _is_quantized ? DataType::S32 : (_is_bfloat16 ? DataType::F32 : data_type);
+        _bias_data_type           = is_quantized ? DataType::S32 : (_is_bfloat16 ? DataType::F32 : data_type);
         _output_data_type         = _is_bfloat16 ? DataType::F32 : data_type;
         _quantization_info        = quantization_info;
         _weight_quantization_info = weight_quantization_info;
         _data_layout              = data_layout;
+        _dst_q_info               = quantization_info;
+
+        if(is_quantized && !is_data_type_quantized_symmetric(weights_data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY))
+        {
+            setup_quantization(input_shape, weights_shape, _quantization_info, _weight_quantization_info, data_type);
+            _use_dynamic_output_quant = true;
+        }
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info);
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info, pre_pad_layer, padded_weights);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info, pre_pad_layer);
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Convolution function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(_data_layout);
+        dst.info()->set_data_layout(_data_layout);
+    }
+
     void regularize_values(void *values, size_t size)
     {
         float *fvalues = static_cast<float *>(values);
@@ -102,16 +186,34 @@ protected:
         {
             case DataType::QASYMM8:
             {
-                std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
-                library->fill(tensor, distribution, i);
+                if(_use_dynamic_output_quant)
+                {
+                    std::uniform_int_distribution<int32_t> distribution(0, 255);
+                    library->fill(tensor, distribution, i);
+                }
+                else
+                {
+                    // Legacy initialization in case the output quantization info can't be reliably estimated
+                    std::pair<int, int>                     bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                    std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
+                    library->fill(tensor, distribution, i);
+                }
                 break;
             }
             case DataType::QASYMM8_SIGNED:
             {
-                std::pair<int, int> bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<int8_t> distribution(bounds.first, bounds.second);
-                library->fill(tensor, distribution, i);
+                if(_use_dynamic_output_quant)
+                {
+                    std::uniform_int_distribution<int32_t> distribution(-128, 127);
+                    library->fill(tensor, distribution, i);
+                }
+                else
+                {
+                    // Legacy initialization in case the output quantization info can't be reliably estimated
+                    std::pair<int, int>                    bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                    std::uniform_int_distribution<int32_t> distribution(bounds.first, bounds.second);
+                    library->fill(tensor, distribution, i);
+                }
                 break;
             }
             case DataType::QSYMM8_PER_CHANNEL:
@@ -130,13 +232,13 @@ protected:
                         max_bound = bounds.second;
                     }
                 }
-                std::uniform_int_distribution<int8_t> distribution(min_bound, max_bound);
+                std::uniform_int_distribution<int32_t> distribution(min_bound, max_bound);
                 library->fill(tensor, distribution, i);
                 break;
             }
             case DataType::S32:
             {
-                std::uniform_int_distribution<int32_t> distribution(-100, 100);
+                std::uniform_int_distribution<int32_t> distribution(_min_bias, _max_bias);
                 library->fill(tensor, distribution, i);
                 break;
             }
@@ -163,8 +265,9 @@ protected:
         }
     }
 
+    // Given input is in NCHW format
     TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info,
-                              bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info)
+                              bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false)
     {
         ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0);
 
@@ -175,6 +278,18 @@ protected:
             permute(input_shape, PermutationVector(2U, 0U, 1U));
             permute(weights_shape, PermutationVector(2U, 0U, 1U));
             permute(output_shape, PermutationVector(2U, 0U, 1U));
+
+            if(pre_pad_layer.size() > 0)
+            {
+                // Make sure paddings exist for each of the c, h, w dimensions
+                for(unsigned int i = 0; i < 3 - pre_pad_layer.size(); ++i)
+                {
+                    pre_pad_layer.push_back({ 0, 0 });
+                }
+
+                // rotate padding info from nchw to nhwc
+                std::rotate(pre_pad_layer.begin(), pre_pad_layer.begin() + 2, pre_pad_layer.begin() + 3);
+            }
         }
 
         const int idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
@@ -186,17 +301,47 @@ protected:
         // Create tensors
         TensorType src     = create_tensor<TensorType>(input_shape, _data_type, 1, _quantization_info, _data_layout);
         TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _weights_data_type, 1, _weight_quantization_info, _data_layout);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _quantization_info, _data_layout);
-        TensorType dst     = create_tensor<TensorType>(output_shape, _output_data_type, 1, _quantization_info, _data_layout);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, QuantizationInfo() /*bias is not a quantized type*/, _data_layout);
+        TensorType dst     = create_tensor<TensorType>(output_shape, _output_data_type, 1, _dst_q_info, _data_layout);
 
         // Create and configure function
         FunctionType conv;
-        detail::configure_conv_function(conv, &src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        const unsigned int height_index = arm_compute::graph::get_dimension_idx(_data_layout, DataLayoutDimension::HEIGHT);
+        const unsigned int width_index  = arm_compute::graph::get_dimension_idx(_data_layout, DataLayoutDimension::WIDTH);
+
+        const PaddingInfo pad_w = width_index < pre_pad_layer.size() ? pre_pad_layer[width_index] : PaddingInfo(0, 0);
+        const PaddingInfo pad_h = height_index < pre_pad_layer.size() ? pre_pad_layer[height_index] : PaddingInfo(0, 0);
+
+        if(pre_pad_layer.size() > 0 && arm_compute::graph::is_padding_in_height_or_width(_data_layout, pre_pad_layer))
+        {
+            // this is the logic implemented in NodeFusionMutator -> fuse_pad_with_convolution
+            const PadStrideInfo new_conv_info(
+                info.stride().first,
+                info.stride().second,
+                info.pad_left() + pad_w.first,
+                info.pad_right() + pad_w.second,
+                info.pad_top() + pad_h.first,
+                info.pad_bottom() + pad_h.second,
+                info.round());
+            detail::configure_conv_function(conv, &src, &weights, &bias, &dst, new_conv_info, weights_info, dilation, act_info, num_groups);
+        }
+        else
+        {
+            detail::configure_conv_function(conv, &src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
+        }
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+        // Test "add padding after configure" behavior. This behavior should not affect the correctness
+        add_padding_x({ &src, &bias, &dst }, _data_layout);
+        // Padding weights may affect code path in some backends
+        if (padded_weights)
+        {
+            add_padding_x({ &weights }, _data_layout);
+        }
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -204,24 +349,31 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
-        fill(AccessorType(src), 0);
-        fill(AccessorType(weights), 1);
-        fill(AccessorType(bias), 2);
+        fill(AccessorType(src), 0 + _hash);
+        fill(AccessorType(weights), 1 + _hash);
+        fill(AccessorType(bias), 2 + _hash);
 
-        // Compute NEConvolutionLayer function
-        conv.run();
+        if(_mixed_layout)
+        {
+            mix_layout(conv, src, dst);
+        }
+        else
+        {
+            // Compute Convolution function
+            conv.run();
+        }
 
         return dst;
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                                      const Size2D &dilation, const ActivationLayerInfo act_info)
+                                      const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}))
     {
         ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0);
 
@@ -237,9 +389,9 @@ protected:
         SimpleTensor<TW>    weights{ weights_shape, weights_dt, 1, _weight_quantization_info };
         SimpleTensor<TBias> bias{ bias_shape, bias_dt, 1, _quantization_info };
 
-        fill(src, 0);
-        fill(weights, 1);
-        fill(bias, 2);
+        fill(src, 0 + _hash);
+        fill(weights, 1 + _hash);
+        fill(bias, 2 + _hash);
 
         // Fill with bfloat16 to perform the conversion and reduce the mismatches in the output
         if(_is_bfloat16)
@@ -248,9 +400,14 @@ protected:
             regularize_values(static_cast<void *>(weights.data()), weights.num_elements());
         }
 
-        return (act_info.enabled()) ? reference::activation_layer<T>(reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation, num_groups),
+        if(pre_pad_layer.size() > 0)
+        {
+            src = reference::pad_layer<T>(src, pre_pad_layer, PixelValue(0), PaddingMode::CONSTANT);
+        }
+
+        return (act_info.enabled()) ? reference::activation_layer<T>(reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation, num_groups, _dst_q_info),
                                                                      act_info) :
-               reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation, num_groups);
+               reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation, num_groups, _dst_q_info);
     }
 
     TensorType       _target{};
@@ -262,34 +419,63 @@ protected:
     DataLayout       _data_layout{};
     QuantizationInfo _quantization_info{};
     QuantizationInfo _weight_quantization_info{};
-    bool             _is_quantized = false;
+    QuantizationInfo _dst_q_info{};
     bool             _is_bfloat16  = false;
+    bool             _mixed_layout = false;
+    bool             _use_dynamic_output_quant{false};
+    int32_t          _hash{0};
+    int32_t          _min_bias{-100};
+    int32_t          _max_bias{100};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class ConvolutionValidationFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
                DataLayout data_layout, ActivationLayerInfo act_info)
     {
         ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
                                                                                                  data_type, data_type, data_layout,
-                                                                                                 QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                 QuantizationInfo(), QuantizationInfo(), act_info, mixed_layout);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
+class ConvolutionValidationPaddedWeightsFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
+               DataLayout data_layout)
+    {
+        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
+                                                                                                 data_type, data_type, data_layout,
+                                                                                                 QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), mixed_layout, PaddingList({}), true);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
+class ConvolutionValidationWithPaddingFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
+               DataLayout data_layout, ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}))
+    {
+        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
+                                                                                                 data_type, data_type, data_layout,
+                                                                                                 QuantizationInfo(), QuantizationInfo(), act_info, mixed_layout, pre_pad_layer);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class ConvolutionValidationQuantizedFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
                DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
     {
         ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
-                                                                                                 data_type, data_type, data_layout, quantization_info, quantization_info, act_info);
+                                                                                                 data_type, data_type, data_layout, quantization_info, quantization_info, act_info, mixed_layout);
     }
 };
 
@@ -297,7 +483,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ConvolutionValidationQuantizedPerChannelFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
                DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataType weights_data_type)
     {
@@ -313,7 +498,311 @@ public:
                                                                                                   quantization_info, QuantizationInfo(weights_scales), act_info);
     }
 };
+
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+inline TensorInfo prepare_weights(const TensorInfo tensor_info, const arm_compute::WeightFormat weight_format)
+{
+    const DataLayout data_layout = tensor_info.data_layout();
+    ARM_COMPUTE_EXPECT(data_layout == DataLayout::NHWC, framework::LogLevel::ERRORS);
+    const DataType    data_type    = tensor_info.data_type();
+    const TensorShape tensor_shape = tensor_info.tensor_shape();
+    const int         N            = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)]; // N=O
+    const int         H            = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
+    const int         W            = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
+    const int         C            = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)]; // C=I
+
+    const int interleave_by = arm_compute::interleave_by(weight_format);
+    const int block_by      = arm_compute::block_by(weight_format);
+    const int Ip            = arm_gemm::roundup<unsigned int>(C, block_by);      // C'=I'
+    const int Op            = arm_gemm::roundup<unsigned int>(N, interleave_by); // O'=N'
+
+    arm_compute::Strides strides_in_bytes = tensor_info.strides_in_bytes();
+    strides_in_bytes.set(1, Ip * interleave_by * H * W * tensor_info.element_size());
+    strides_in_bytes.set(2, Ip * Op * tensor_info.element_size());
+
+    const size_t offset_first_element_in_bytes = tensor_info.offset_first_element_in_bytes();
+
+    // Total size needs to include padded dimensions
+    const size_t total_size_in_bytes = Op * H * W * Ip * tensor_info.element_size();
+
+    const TensorShape TS(Ip, W, H, Op);
+
+    TensorInfo new_tensor_info = tensor_info;
+    new_tensor_info.init(TS, 1 /*num_channels, deprecated*/, data_type, strides_in_bytes,
+        offset_first_element_in_bytes, total_size_in_bytes);
+    return new_tensor_info;
+}
+
+template <typename ScalarType, typename AccessorType>
+inline void rearrange_data(const AccessorType src, AccessorType dst, const arm_compute::WeightFormat weight_format)
+{
+    ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format(weight_format), framework::LogLevel::ERRORS);
+    // Data Layout: OHWIo<interleave_by>i<block_by>
+    const int         interleave_by    = arm_compute::interleave_by(weight_format);
+    const int         block_by         = arm_compute::block_by(weight_format);
+    const TensorShape src_tensor_shape = src.shape();
+    const DataLayout  data_layout      = src.data_layout();
+    ARM_COMPUTE_EXPECT(data_layout == DataLayout::NHWC, framework::LogLevel::ERRORS);
+    const unsigned int O  = src_tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)]; // N=O
+    const unsigned int H  = src_tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
+    const unsigned int W  = src_tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
+    const unsigned int I  = src_tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)]; // C=I
+    const unsigned int Ip = arm_gemm::roundup<unsigned int>(I, block_by);                                                 // C'=I'
+    const unsigned int Op = arm_gemm::roundup<unsigned int>(O, interleave_by);                                            // N'=O'
+
+    ARM_COMPUTE_EXPECT_EQUAL(Op * H * W * Ip, (unsigned)dst.num_elements(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(src.num_elements() <= dst.num_elements(), framework::LogLevel::ERRORS);
+
+    const ScalarType *src_ptr = reinterpret_cast<const ScalarType *>(src.data());
+    ScalarType       *dst_ptr = reinterpret_cast<ScalarType *>(dst.data());
+    for(unsigned i = 0; i < I; ++i)
+        for(unsigned w = 0; w < W; ++w)
+            for(unsigned h = 0; h < H; ++h)
+                for(unsigned o = 0; o < O; ++o)
+                {
+                    ScalarType src_element;
+                    switch(data_layout)
+                    {
+                        case DataLayout::NHWC:
+                        {
+                            src_element = src_ptr[o * H * W * I + h * W * I + w * I + i];
+                        }
+                        break;
+                        default:
+                        {
+                            ARM_COMPUTE_ERROR("Unsupported memory layout.");
+                        }
+                    }
+                    const int x5      = static_cast<int>(o) / interleave_by; // exact integer division: index of the interleave_by-sized block of o
+                    const int x4      = h;
+                    const int x3      = w;
+                    const int x2      = static_cast<int>(i) / block_by; // exact integer division: index of the block_by-sized block of i
+                    const int x1      = o % interleave_by;
+                    const int x0      = i % block_by;
+                    unsigned  dst_idx = x5 * H * W * Ip * interleave_by
+                                        + x4 * W * Ip * interleave_by
+                                        + x3 * Ip * interleave_by
+                                        + x2 * interleave_by * block_by
+                                        + x1 * block_by
+                                        + x0;
+                    dst_ptr[dst_idx] = src_element;
+                }
+}
+
+template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType, bool enable_fast_math>
+class VariableWeightsFixtureBaseClass : public framework::Fixture
+{
+public:
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataLayout data_layout,
+               const DataType data_type)
+    {
+        conv = std::make_unique<ConvolutionFunction>();
+        // prepare data
+        _data_layout = data_layout;
+        // Fixed format kernels for variable weights can work only with NHWC format.
+        ARM_COMPUTE_EXPECT_EQUAL(_data_layout, DataLayout::NHWC, framework::LogLevel::ERRORS);
+        _data_type = data_type;
+        // run the code
+        compute_target(input_shape, weights_shape, bias_shape, output_shape, info, dilation);
+        compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation);
+    }
+    void teardown()
+    {
+        _target.allocator()->free();
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F16:
+            {
+                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+private:
+    virtual void configure_and_execute_kernel(TensorInfo src_tensor_info, TensorInfo weight_tensor_info, TensorInfo bias_tensor_info, TensorInfo dst_tensor_info, const WeightsInfo weights_info,
+                                              const PadStrideInfo &conv_info,
+                                              const Size2D        &dilation) = 0;
+
+    void compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &conv_info,
+                        const Size2D &dilation)
+    {
+        // The dataset is always in NCHW format - we need to make C the
+        // innermost dimension because the fixed-format kernels work only
+        // with NHWC layout.
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+        permute(weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(output_shape, PermutationVector(2U, 0U, 1U));
+        const auto src_tensor_info    = TensorInfo(input_shape, 1, _data_type, _data_layout);
+        const auto weight_tensor_info = TensorInfo(weights_shape, 1, _data_type, _data_layout);
+        const auto bias_tensor_info   = TensorInfo(bias_shape, 1, _data_type, _data_layout);
+        auto       dst_tensor_info    = TensorInfo(output_shape, 1, _data_type, _data_layout);
+
+        const int kernel_height = weights_shape[get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT)];
+        const int kernel_width  = weights_shape[get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)];
+        const int num_kernels   = weights_shape[get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES)];
+
+        const WeightsInfo query_weights_info(/*reshape_weights*/ false, kernel_width, kernel_height, num_kernels, false, arm_compute::WeightFormat::ANY);
+        const bool        kernel_found = bool(ConvolutionFunction::has_opt_impl(_computed_weight_format, &src_tensor_info, &weight_tensor_info,
+                                                                                &bias_tensor_info, &dst_tensor_info, conv_info, query_weights_info));
+        // Make sure that the setup finds a fixed-format kernel as requested by the test case.
+        ARM_COMPUTE_EXPECT(kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format(_computed_weight_format), framework::LogLevel::ERRORS);
+
+        const WeightsInfo weights_info(/*reshape_weights*/ false, kernel_width, kernel_height, num_kernels, false, _computed_weight_format);
+        configure_and_execute_kernel(src_tensor_info, weight_tensor_info, bias_tensor_info, dst_tensor_info, weights_info, conv_info,
+                                     dilation);
+    }
+    void compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                           const Size2D &dilation)
+    {
+        ARM_COMPUTE_UNUSED(input_shape, weights_shape, bias_shape, output_shape, info,
+                           dilation);
+
+        // Create reference
+        SimpleTensor<ScalarType> src{ input_shape, _data_type };
+        SimpleTensor<ScalarType> weights{ weights_shape, _data_type };
+        SimpleTensor<ScalarType> bias{ bias_shape, _data_type };
+        fill(src, 0);
+        fill(bias, 1);
+        fill(weights, 3);
+        _reference = reference::convolution_layer<ScalarType>(src, weights, bias, output_shape, info, dilation, 1 /*num_groups*/);
+    }
+    DataLayout _data_layout{};
+    DataType   _data_type{};
+
+protected:
+    std::unique_ptr<ConvolutionFunction> conv{};
+    arm_compute::WeightFormat            _computed_weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
+    TensorClass                          _target{};
+    SimpleTensor<ScalarType>             _reference{};
+};
+
+template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType, bool enable_fast_math>
+class VariableWeightsFixture : public VariableWeightsFixtureBaseClass<ConvolutionFunction, TensorClass, AccessorType, ScalarType, enable_fast_math>
+{
+    void configure_and_execute_kernel(TensorInfo src_tensor_info, TensorInfo weight_tensor_info, TensorInfo bias_tensor_info, TensorInfo dst_tensor_info, const WeightsInfo weights_info,
+                                      const PadStrideInfo &conv_info,
+                                      const Size2D        &dilation) override
+    {
+        this->conv->configure(&src_tensor_info, &weight_tensor_info, &bias_tensor_info, &dst_tensor_info, conv_info, weights_info, dilation, ActivationLayerInfo(), enable_fast_math);
+
+        // Allocate input tensors
+        auto             src                 = create_tensor<TensorClass>(src_tensor_info);
+        auto             weights_original    = create_tensor<TensorClass>(weight_tensor_info);
+        const TensorInfo new_tensor_info     = prepare_weights(weight_tensor_info, this->_computed_weight_format);
+        auto             weights_transformed = create_tensor<TensorClass>(new_tensor_info);
+        auto             bias                = create_tensor<TensorClass>(bias_tensor_info);
+        src.allocator()->allocate();
+        weights_original.allocator()->allocate();
+        weights_transformed.allocator()->allocate();
+        bias.allocator()->allocate();
+        // Allocate destination tensor
+        this->_target = create_tensor<TensorClass>(dst_tensor_info);
+        this->_target.allocator()->allocate();
+
+        // Prepare source and biases that are left unchanged.
+        this->fill(AccessorType(src), 0);
+        this->fill(AccessorType(bias), 1);
+
+        // First run
+        this->fill(AccessorType(weights_original), 2);
+        rearrange_data<ScalarType, AccessorType>(AccessorType(weights_original), AccessorType(weights_transformed), this->_computed_weight_format);
+        ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weights_transformed }, { TensorType::ACL_SRC_2, &bias }, { TensorType::ACL_DST, &(this->_target) } };
+        this->conv->run(run_pack);
+        // Second run, with new weights (same fill seed as the weights used by the base class reference)
+        this->fill(AccessorType(weights_original), 3);
+        rearrange_data<ScalarType, AccessorType>(AccessorType(weights_original), AccessorType(weights_transformed), this->_computed_weight_format);
+        this->conv->run(run_pack);
+        src.allocator()->free();
+        weights_original.allocator()->free();
+        weights_transformed.allocator()->free();
+        bias.allocator()->free();
+    }
+};
+
+template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType, bool enable_fast_math>
+class VariableWeightsFixtureNEInterface : public VariableWeightsFixtureBaseClass<ConvolutionFunction, TensorClass, AccessorType, ScalarType, enable_fast_math>
+{
+    void configure_and_execute_kernel(TensorInfo src_tensor_info, TensorInfo weight_tensor_info, TensorInfo bias_tensor_info, TensorInfo dst_tensor_info, const WeightsInfo weights_info,
+                                      const PadStrideInfo &conv_info,
+                                      const Size2D        &dilation) override
+    {
+        // Allocate input tensors
+        auto             src                 = create_tensor<TensorClass>(src_tensor_info);
+        auto             weights_original    = create_tensor<TensorClass>(weight_tensor_info);
+        const TensorInfo new_tensor_info     = prepare_weights(weight_tensor_info, this->_computed_weight_format);
+        auto             weights_transformed = create_tensor<TensorClass>(new_tensor_info);
+        auto             bias                = create_tensor<TensorClass>(bias_tensor_info);
+        src.allocator()->allocate();
+        weights_original.allocator()->allocate();
+        weights_transformed.allocator()->allocate();
+        bias.allocator()->allocate();
+        // Allocate destination tensor
+        this->_target = create_tensor<TensorClass>(dst_tensor_info);
+        this->_target.allocator()->allocate();
+        this->conv->configure(&src, &weights_transformed, &bias, &(this->_target), conv_info, weights_info, dilation, ActivationLayerInfo(), enable_fast_math);
+        // Prepare source and biases that are left unchanged.
+        this->fill(AccessorType(src), 0);
+        this->fill(AccessorType(bias), 1);
+
+        // First run
+        this->fill(AccessorType(weights_original), 2);
+        rearrange_data<ScalarType, AccessorType>(AccessorType(weights_original), AccessorType(weights_transformed), this->_computed_weight_format);
+        this->conv->run();
+        // Second run, with new weights (same fill seed as the weights used by the base class reference)
+        this->fill(AccessorType(weights_original), 3);
+        rearrange_data<ScalarType, AccessorType>(AccessorType(weights_original), AccessorType(weights_transformed), this->_computed_weight_format);
+        this->conv->run();
+        src.allocator()->free();
+        weights_original.allocator()->free();
+        weights_transformed.allocator()->free();
+        bias.allocator()->free();
+    }
+};
+
+template <typename ConvolutionClass, bool enable_fast_math>
+class HasOptImplFixture : public framework::Fixture
+{
+public:
+    void setup(DataType data_type, arm_compute::WeightFormat query_weight_format)
+    {
+        auto              conv        = std::make_unique<ConvolutionClass>();
+        const auto        src_info    = TensorInfo(TensorShape(56U, 56U, 64U), 1, data_type, DataLayout::NHWC);
+        const auto        weight_info = TensorInfo(TensorShape(64, 3U, 3U, 64U), 1, enable_fast_math ? DataType::BFLOAT16 : data_type, DataLayout::NHWC);
+        const auto        bias_info   = TensorInfo(TensorShape(64U), 1, data_type, DataLayout::NHWC);
+        auto              dst_info    = TensorInfo(TensorShape(56U, 56U, 64U), 1, data_type, DataLayout::NHWC);
+        const auto        conv_info   = PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR);
+        const WeightsInfo weights_info(false, 3U, 3U, 64U, false, query_weight_format);
+        _kernel_found = bool(ConvolutionClass::has_opt_impl(_computed_weight_format, &src_info, &weight_info,
+                                                            &bias_info, &dst_info, conv_info, weights_info,
+                                                            Size2D(1U, 1U) /*dilation*/, ActivationLayerInfo() /*act_info*/, enable_fast_math));
+    }
+
+protected:
+    bool                      _kernel_found{ false };
+    arm_compute::WeightFormat _computed_weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
+};
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CONVOLUTION_LAYER_FIXTURE */
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_CONVOLUTIONLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/CopyFixture.h b/tests/validation/fixtures/CopyFixture.h
index feb1d7dfb5..f5e711a500 100644
--- a/tests/validation/fixtures/CopyFixture.h
+++ b/tests/validation/fixtures/CopyFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class CopyFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
     {
         _target    = compute_target(input_shape, output_shape, data_type);
@@ -61,7 +60,7 @@ protected:
     TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
     {
         // Check if indeed the input shape can be reshape to the output one
-        ARM_COMPUTE_EXPECT(input_shape.total_size() == output_shape.total_size(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input_shape.total_size() == output_shape.total_size());
 
         // Create tensors
         TensorType src = create_tensor<TensorType>(input_shape, data_type);
@@ -72,15 +71,15 @@ protected:
 
         copy.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/CropResizeFixture.h b/tests/validation/fixtures/CropResizeFixture.h
index 4f6389155a..30a3fd8569 100644
--- a/tests/validation/fixtures/CropResizeFixture.h
+++ b/tests/validation/fixtures/CropResizeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,6 @@
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
-#include "tests/RawLutAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
@@ -47,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class CropResizeFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape src_shape, TensorShape boxes_shape, Coordinates2D crop_size, InterpolationPolicy method,
                float extrapolation_value, bool is_outside_bounds, DataType data_type)
     {
@@ -88,15 +86,15 @@ protected:
         FunctionType crop;
         crop.configure(&src, &boxes, &boxes_ind, &dst, crop_size, method, extrapolation_value);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/DeconvolutionLayerFixture.h b/tests/validation/fixtures/DeconvolutionLayerFixture.h
index 6ea2335ae9..83170c413c 100644
--- a/tests/validation/fixtures/DeconvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DeconvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,22 +42,24 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
 class DeconvolutionLayerFixtureBase : public framework::Fixture
 {
 public:
     using TBias = typename std::conditional < std::is_same<typename std::decay<T>::type, uint8_t>::value || std::is_same<typename std::decay<T>::type, int8_t>::value, int32_t, T >::type;
 
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
-               DataType data_type, DataLayout data_layout, QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, bool add_bias)
+               DataType data_type, DataType weights_data_type, DataLayout data_layout,
+               QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, QuantizationInfo weights_quantization_info, bool add_bias)
     {
-        _data_type                = data_type;
-        _bias_data_type           = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
-        _data_layout              = data_layout;
-        _input_quantization_info  = input_quantization_info;
-        _output_quantization_info = output_quantization_info;
+        _data_type                 = data_type;
+        _weights_data_type         = weights_data_type;
+        _bias_data_type            = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+        _data_layout               = data_layout;
+        _input_quantization_info   = input_quantization_info;
+        _output_quantization_info  = output_quantization_info;
+        _weights_quantization_info = weights_quantization_info;
 
         _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, add_bias);
         _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, add_bias);
@@ -72,14 +74,34 @@ protected:
             case DataType::QASYMM8:
             {
                 std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
+                std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
                 library->fill(tensor, distribution, i);
                 break;
             }
             case DataType::QASYMM8_SIGNED:
             {
                 std::pair<int, int> bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<int8_t> distribution(bounds.first, bounds.second);
+                std::uniform_int_distribution<int32_t> distribution(bounds.first, bounds.second);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::QSYMM8_PER_CHANNEL:
+            {
+                int min_bound = 128;
+                int max_bound = -127;
+                for(size_t i = 0; i < _input_quantization_info.scale().size(); i++)
+                {
+                    std::pair<int, int> bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                    if(bounds.first < min_bound)
+                    {
+                        min_bound = bounds.first;
+                    }
+                    if(bounds.second > max_bound)
+                    {
+                        max_bound = bounds.second;
+                    }
+                }
+                std::uniform_int_distribution<int32_t> distribution(min_bound, max_bound);
                 library->fill(tensor, distribution, i);
                 break;
             }
@@ -113,8 +135,7 @@ protected:
         {
             case DataType::S32:
             {
-                const int32_t value = static_cast<int32_t>(tensor.quantization_info().uniform().offset);
-                library->fill_tensor_value(tensor, value);
+                library->fill_tensor_value(tensor, 0);
                 break;
             }
             case DataType::F16:
@@ -140,7 +161,7 @@ protected:
 
         // Create tensors
         TensorType src     = create_tensor<TensorType>(input_shape, _data_type, 1, _input_quantization_info, _data_layout);
-        TensorType weights = create_tensor<TensorType>(weights_shape, _data_type, 1, _input_quantization_info, _data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout);
         TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _input_quantization_info, _data_layout);
         TensorType dst     = create_tensor<TensorType>(output_shape, _data_type, 1, _output_quantization_info, _data_layout);
 
@@ -148,13 +169,13 @@ protected:
         FunctionType conv;
         conv.configure(&src, &weights, add_bias ? &bias : nullptr, &dst, info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
         }
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -165,13 +186,13 @@ protected:
         }
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
         }
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
@@ -183,7 +204,6 @@ protected:
 
         // Compute DeconvolutionLayer function
         conv.run();
-
         return dst;
     }
 
@@ -192,7 +212,7 @@ protected:
     {
         // Create reference
         SimpleTensor<T>     src{ input_shape, _data_type, 1, _input_quantization_info };
-        SimpleTensor<T>     weights{ weights_shape, _data_type, 1, _input_quantization_info };
+        SimpleTensor<TW>    weights{ weights_shape, _weights_data_type, 1, _weights_quantization_info };
         SimpleTensor<TBias> bias{ bias_shape, _bias_data_type, 1, _input_quantization_info };
 
         // Fill reference
@@ -207,28 +227,27 @@ protected:
         {
             fill_zeros(bias);
         }
-
-        return reference::deconvolution_layer<T>(src, weights, bias, output_shape, info, _output_quantization_info);
+        return reference::deconvolution_layer<T, TW>(src, weights, bias, output_shape, info, _output_quantization_info);
     }
 
     TensorType       _target{};
     SimpleTensor<T>  _reference{};
     DataType         _data_type{};
+    DataType         _weights_data_type{};
     DataType         _bias_data_type{};
     DataLayout       _data_layout{};
     QuantizationInfo _input_quantization_info{};
     QuantizationInfo _output_quantization_info{};
+    QuantizationInfo _weights_quantization_info{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, unsigned int kernel_size_x, unsigned int kernel_size_y>
-class DeconvolutionValidationFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>
+class DeconvolutionValidationFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, unsigned int sx, unsigned int sy, unsigned int padx, unsigned int pady,
                unsigned int num_kernels, DataType data_type, DataLayout data_layout, bool add_bias)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(kernel_size_x != kernel_size_y, "Only square kernels supported");
         const TensorShape   weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
         const TensorShape   bias_shape(num_kernels);
         const PadStrideInfo info(sx, sy, padx, pady, DimensionRoundingType::CEIL);
@@ -236,20 +255,18 @@ public:
         TensorInfo          input_info(input_shape, 1, data_type);
         TensorInfo          weights_info(weights_shape, 1, data_type);
         TensorShape         output_shape = compute_deconvolution_output_shape(out_dim, input_info, weights_info);
-        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, data_layout, QuantizationInfo(),
-                                                                                        QuantizationInfo(), add_bias);
+        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, data_type, data_layout, QuantizationInfo(),
+                                                                                           QuantizationInfo(), QuantizationInfo(), add_bias);
     }
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, unsigned int kernel_size_x, unsigned int kernel_size_y>
-class DeconvolutionValidationAsymmFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>
+class DeconvolutionValidationAsymmFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, unsigned int sx, unsigned int sy, unsigned int pad_left, unsigned int pad_right, unsigned int pad_top,
                unsigned int pad_bottom, unsigned int num_kernels, DataType data_type, DataLayout data_layout, bool add_bias)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(kernel_size_x != kernel_size_y, "Only square kernels supported");
         const TensorShape   weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
         const TensorShape   bias_shape(num_kernels);
         const PadStrideInfo info(sx, sy, pad_left, pad_right, pad_top, pad_bottom, DimensionRoundingType::CEIL);
@@ -257,20 +274,18 @@ public:
         TensorInfo          input_info(input_shape, 1, data_type);
         TensorInfo          weights_info(weights_shape, 1, data_type);
         TensorShape         output_shape = compute_deconvolution_output_shape(out_dim, input_info, weights_info);
-        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, data_layout, QuantizationInfo(),
-                                                                                        QuantizationInfo(), add_bias);
+        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, data_type, data_layout, QuantizationInfo(),
+                                                                                           QuantizationInfo(), QuantizationInfo(), add_bias);
     }
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, unsigned int kernel_size_x, unsigned int kernel_size_y>
-class DeconvolutionValidationQuantizedFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>
+class DeconvolutionValidationQuantizedFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, unsigned int sx, unsigned int sy, unsigned int padx, unsigned int pady,
                unsigned int num_kernels, DataType data_type, DataLayout data_layout, QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, bool add_bias)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(kernel_size_x != kernel_size_y, "Only square kernels supported");
         const TensorShape   weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
         const TensorShape   bias_shape(num_kernels);
         const PadStrideInfo info(sx, sy, padx, pady, DimensionRoundingType::CEIL);
@@ -278,8 +293,38 @@ public:
         TensorInfo          input_info(input_shape, 1, data_type, input_quantization_info);
         TensorInfo          weights_info(weights_shape, 1, data_type, input_quantization_info);
         TensorShape         output_shape = compute_deconvolution_output_shape(out_dim, input_info, weights_info);
-        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, data_layout, input_quantization_info,
-                                                                                        output_quantization_info, add_bias);
+        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, data_type, data_layout,
+                                                                                           input_quantization_info,
+                                                                                           output_quantization_info, input_quantization_info, add_bias);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW, unsigned int kernel_size_x, unsigned int kernel_size_y>
+class DeconvolutionValidationQuantizedPerChannelFixture : public DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, TW>
+{ // Deconvolution fixture that generates a random vector of weights scales and forwards it, with T and TW, to the base fixture.
+public:
+    void setup(TensorShape input_shape, unsigned int sx, unsigned int sy, unsigned int padx, unsigned int pady,
+               unsigned int num_kernels, DataType data_type, DataLayout data_layout, QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, bool add_bias,
+               DataType weights_data_type)
+    {
+        const TensorShape   weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
+        const TensorShape   bias_shape(num_kernels);
+        const PadStrideInfo info(sx, sy, padx, pady, DimensionRoundingType::CEIL);
+        auto                out_dim = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, info);
+        TensorInfo          input_info(input_shape, 1, data_type, input_quantization_info);
+        TensorInfo          weights_info(weights_shape, 1, weights_data_type, input_quantization_info); // Only used for the output-shape computation below; carries the input quantization info, not the generated scales.
+        TensorShape         output_shape = compute_deconvolution_output_shape(out_dim, input_info, weights_info);
+        // Generate one weights scale per entry along output_shape[2], drawn from uniform_real_distribution<float>(0.01f, 1.f) with an engine seeded by library->seed().
+        std::vector<float>                    weights_scales{};
+        std::mt19937                          gen(library->seed());
+        std::uniform_real_distribution<float> dis(0.01f, 1.f);
+        for(size_t i = 0; i < output_shape[2]; ++i) // NOTE(review): presumably output_shape[2] is the output-channel count (== num_kernels) - confirm.
+        {
+            weights_scales.push_back(dis(gen));
+        }
+        DeconvolutionLayerFixtureBase<TensorType, AccessorType, FunctionType, T, TW>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, weights_data_type, data_layout,
+                                                                                            input_quantization_info,
+                                                                                            output_quantization_info, QuantizationInfo(weights_scales), add_bias); // The weights quantization info is built from the generated scales.
+    }
+};
 
diff --git a/tests/validation/fixtures/DepthConvertLayerFixture.h b/tests/validation/fixtures/DepthConvertLayerFixture.h
index 937a1a06a9..f55d20bf3e 100644
--- a/tests/validation/fixtures/DepthConvertLayerFixture.h
+++ b/tests/validation/fixtures/DepthConvertLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DepthConvertLayerValidationBaseFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift, QuantizationInfo quantization_info)
     {
         _shift             = shift;
@@ -61,13 +60,13 @@ protected:
         if(is_data_type_quantized(tensor.data_type()))
         {
             std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-            std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
+            std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
 
             library->fill(tensor, distribution, i);
         }
         else
         {
-            // When converting S32 to F16, both reference and Neon implementations are + or - infinity outside the F16 range.
+            // When converting S32 to F16, both reference and Compute Library implementations are + or - infinity outside the F16 range.
             if(dt_in == DataType::S32 && dt_out == DataType::F16)
             {
                 std::uniform_int_distribution<int32_t> distribution_s32(-65504, 65504);
@@ -90,15 +89,15 @@ protected:
         FunctionType depth_convert;
         depth_convert.configure(&src, &dst, policy, shift);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0, dt_in, dt_out);
@@ -130,7 +129,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DepthConvertLayerValidationFixture : public DepthConvertLayerValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift)
     {
         DepthConvertLayerValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, dt_in, dt_out, policy,
@@ -142,7 +140,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DepthConvertLayerValidationQuantizedFixture : public DepthConvertLayerValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift, QuantizationInfo quantization_info)
     {
         DepthConvertLayerValidationBaseFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, dt_in, dt_out, policy,
diff --git a/tests/validation/fixtures/DepthToSpaceLayerFixture.h b/tests/validation/fixtures/DepthToSpaceLayerFixture.h
index a254ba4322..abe3d8b22f 100644
--- a/tests/validation/fixtures/DepthToSpaceLayerFixture.h
+++ b/tests/validation/fixtures/DepthToSpaceLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,7 +39,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DepthToSpaceLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, int32_t block_shape, TensorShape output_shape, DataType data_type, DataLayout data_layout)
     {
         _target    = compute_target(input_shape, block_shape, output_shape, data_type, data_layout);
@@ -73,15 +72,15 @@ protected:
         FunctionType depth_to_space;
         depth_to_space.configure(&input, &output, block_shape);
 
-        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
         // Allocate tensors
         input.allocator()->allocate();
         output.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(input), 0);
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index d9806b5c84..6e2e3a3846 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_FIXTURE
-#define ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DEPTHWISECONVOLUTIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DEPTHWISECONVOLUTIONLAYERFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -38,6 +38,7 @@
 
 #include "utils/Utils.h"
 
+#include <cstdint>
 #include <random>
 
 namespace arm_compute
@@ -54,35 +55,83 @@ class DepthwiseConvolutionLayerValidationGenericFixture : public framework::Fixt
 public:
     using TBias = typename std::conditional < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int32_t, T >::type;
 
+    void setup_quantization(TensorShape input_shape, TensorShape weights_shape, QuantizationInfo &input_q_info,
+        QuantizationInfo &weights_q_info, DataType data_type)
+    { // Randomises the input/weights quantization members, then takes the output quantization and bias range from suggest_conv_dst_q_info_and_bias.
+        ARM_COMPUTE_UNUSED(input_shape);
+        const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+        const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+        // The engine seed is library->seed() + _hash; setup() computes _hash from the input shape, kernel size, dilation and padding values.
+        std::mt19937                           generator(library->seed() + _hash);
+        std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        _input_quantization_info = QuantizationInfo(scale_lhs, offset_lhs);
+        _weights_quantization_info = QuantizationInfo(scale_rhs, offset_rhs);
+        // NOTE(review): the hint is computed from the reference parameters, not the members assigned above; the visible caller (setup) passes those same members, so they alias.
+        QuantizationHint q_hint = suggest_conv_dst_q_info_and_bias(input_q_info, weights_q_info,
+            weights_shape.y() /* heights */, weights_shape.x() /* width */, 1 /* channels */,
+            data_type, 0.5f /* bias_fraction */);
+
+        _output_quantization_info = q_hint.q_info;
+        _min_bias = q_hint.bias_min;
+        _max_bias = q_hint.bias_max;
+    }
+
 public:
-    template <typename...>
     void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, Size2D dilation,
                unsigned int depth_multiplier, DataType input_data_type, DataType weights_data_type,
                QuantizationInfo input_quantization_info, QuantizationInfo weights_quantization_info, QuantizationInfo output_quantization_info,
-               DataLayout data_layout, ActivationLayerInfo act_info)
+               DataLayout data_layout, ActivationLayerInfo act_info, bool mixed_layout = false, bool in_place = false, bool run_twice = false)
     {
+        ARM_COMPUTE_ERROR_ON(mixed_layout && in_place);
+        // This hash is used to seed the random generators. Hash collisions are
+        // possible, but that is acceptable: it is a very easy way to make the
+        // random generation process differ for many test configurations,
+        // which were using the same set of values before.
+        _hash = in_shape[0] + in_shape[1] + in_shape[2] + in_shape[3] +
+            kernel_size.width + kernel_size.height + dilation.x() +
+            dilation.y() + pad_stride_info.pad_bottom() + pad_stride_info.pad_left() + pad_stride_info.pad_right() + pad_stride_info.pad_top();
+
+        _mixed_layout              = mixed_layout;
         _input_shape               = in_shape;
         _input_data_type           = input_data_type;
         _weights_data_type         = weights_data_type;
-        _input_quantization_info   = input_quantization_info;
-        _weights_quantization_info = weights_quantization_info;
-        _output_quantization_info  = output_quantization_info;
         _data_layout               = data_layout;
         _pad_stride_info           = pad_stride_info;
         _act_info                  = act_info;
         _depth_multiplier          = depth_multiplier;
         _dilation                  = dilation;
+        _in_place                  = in_place;
+        _run_twice                 = run_twice;
 
         _bias_data_type = is_data_type_quantized(_input_data_type) ? DataType::S32 : _input_data_type;
 
         _weights_shape = TensorShape(kernel_size.width, kernel_size.height);
 
-        const TensorInfo in_info(_input_shape, 1, _input_data_type);
-        const TensorInfo we_info(_weights_shape, 1, _weights_data_type);
-        _output_shape = compute_depthwise_convolution_shape(in_info, we_info, _pad_stride_info, _depth_multiplier, _dilation);
+        const TensorInfo      in_info(_input_shape, 1, _input_data_type);
+        const TensorInfo      we_info(_weights_shape, 1, _weights_data_type);
+        const ConvolutionInfo info{ _pad_stride_info, _depth_multiplier, _act_info, _dilation };
+        _output_shape = compute_depthwise_convolution_shape(in_info, we_info, info);
 
         _weights_shape.set(2, _output_shape.z());
         _biases_shape = TensorShape(_weights_shape[2]);
+
+        _input_quantization_info = input_quantization_info;
+        _weights_quantization_info = weights_quantization_info;
+        _output_quantization_info = output_quantization_info;
+
+        if(is_data_type_quantized(_input_data_type) && !is_data_type_quantized_symmetric(weights_data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY))
+        {
+            setup_quantization(in_shape, _weights_shape, _input_quantization_info, _weights_quantization_info, _input_data_type);
+            _use_dynamic_output_quant = true;
+        }
     }
 
     void configure_target()
@@ -99,18 +148,33 @@ public:
         }
 
         // Create tensors
-        _src     = create_tensor<TensorType>(input_shape, _input_data_type, 1, _input_quantization_info, _data_layout);
-        _weights = create_tensor<TensorType>(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout);
-        _biases  = create_tensor<TensorType>(_biases_shape, _bias_data_type, 1, _input_quantization_info, _data_layout);
-        _target  = create_tensor<TensorType>(output_shape, _input_data_type, 1, _output_quantization_info, _data_layout);
+        _src                      = create_tensor<TensorType>(input_shape, _input_data_type, 1, _input_quantization_info, _data_layout);
+        _weights                  = create_tensor<TensorType>(weights_shape, _weights_data_type, 1, _weights_quantization_info, _data_layout);
+        if(_run_twice) {
+            _weights.info()->set_are_values_constant(false);
+        }
+        _biases                   = create_tensor<TensorType>(_biases_shape, _bias_data_type, 1, _input_quantization_info, _data_layout);
+        TensorType *target_to_use = nullptr;
+        if(!_in_place)
+        {
+            _target       = create_tensor<TensorType>(output_shape, _input_data_type, 1, _output_quantization_info, _data_layout);
+            target_to_use = &_target;
+        }
+
+        add_padding_x({ &_src, &_biases }, _data_layout);
+        add_padding_x({ &_weights }, _data_layout, true);
+        if(!_in_place)
+        {
+            add_padding_x({ &_target }, _data_layout);
+        }
 
         // Create Depthwise Convolution configure function
-        _dwc.configure(&_src, &_weights, &_biases, &_target, _pad_stride_info, _depth_multiplier, _act_info, _dilation);
+        _dwc.configure(&_src, &_weights, &_biases, target_to_use, _pad_stride_info, _depth_multiplier, _act_info, _dilation);
 
-        ARM_COMPUTE_EXPECT(_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_target.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(_src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_biases.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_target.info()->is_resizable());
     }
 
     void allocate_and_run_target()
@@ -119,20 +183,41 @@ public:
         _src.allocator()->allocate();
         _weights.allocator()->allocate();
         _biases.allocator()->allocate();
-        _target.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_target.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!_src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable());
+
+        if(!_in_place)
+        {
+            _target.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
+        }
 
         // Fill tensors
-        fill(AccessorType(_src), 0);
-        fill(AccessorType(_weights), 1);
-        fill(AccessorType(_biases), 2);
+        fill(AccessorType(_src), 0 + _hash);
+        fill(AccessorType(_weights), 1 + _hash);
+        fill(AccessorType(_biases), 2 + _hash);
+
+        // Run with variable input
+        if(_run_twice) {
+            _dwc.run();
+
+            // Fill tensors with a new seed
+            fill(AccessorType(_src), 3 + _hash);
+            fill(AccessorType(_weights), 4 + _hash);
+            fill(AccessorType(_biases), 5 + _hash);
+        }
 
-        // Compute function
-        _dwc.run();
+        if(_mixed_layout)
+        {
+            mix_layout(_dwc, _src, _target);
+        }
+        else
+        {
+            // Compute function
+            _dwc.run();
+        }
     }
 
     void compute_reference()
@@ -141,15 +226,41 @@ public:
         SimpleTensor<TW>    weights{ _weights_shape, _weights_data_type, 1, _weights_quantization_info };
         SimpleTensor<TBias> biases{ _biases_shape, _bias_data_type, 1, _input_quantization_info };
 
-        fill(src, 0);
-        fill(weights, 1);
-        fill(biases, 2);
+        fill(src, 0 + _hash);
+        fill(weights, 1 + _hash);
+        fill(biases, 2 + _hash);
+
+        if(_run_twice) {
+            SimpleTensor<T> depth_out = reference::depthwise_convolution(src, weights, biases, _output_shape, _pad_stride_info, _depth_multiplier, _dilation, _output_quantization_info);
+            if(_act_info.enabled()) {
+                reference::activation_layer<T>(depth_out, _act_info);
+            }
+
+            fill(src, 3 + _hash);
+            fill(weights, 4 + _hash);
+            fill(biases, 5 + _hash);
+        }
 
         SimpleTensor<T> depth_out = reference::depthwise_convolution(src, weights, biases, _output_shape, _pad_stride_info, _depth_multiplier, _dilation, _output_quantization_info);
         _reference                = (_act_info.enabled()) ? reference::activation_layer<T>(depth_out, _act_info) : depth_out;
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        ARM_COMPUTE_ERROR_ON(_in_place);
+        // Multi DataLayout graph case: tag src/dst with the opposite layout after configure
+        const DataLayout flipped_layout = (_data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
+        src.info()->set_data_layout(flipped_layout);
+        dst.info()->set_data_layout(flipped_layout);
+        // Run the configured function while the tensors report the flipped layout
+        layer.run();
+
+        // Put the configured layout back so the test suite reads the values correctly
+        src.info()->set_data_layout(_data_layout);
+        dst.info()->set_data_layout(_data_layout);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i)
     {
@@ -157,32 +268,77 @@ protected:
         {
             case DataType::QASYMM8:
             {
-                std::uniform_int_distribution<uint8_t> distribution(0, 10);
-                library->fill(tensor, distribution, i);
+                if(_use_dynamic_output_quant)
+                {
+                    std::uniform_int_distribution<int32_t> distribution(0, 255);
+                    library->fill(tensor, distribution, i);
+                }
+                else
+                {
+                    // Legacy initialization in case the output quantization info can't be reliably estimated
+                    std::pair<int, int>                     bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                    std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
+                    library->fill(tensor, distribution, i);
+                }
                 break;
             }
             case DataType::QASYMM8_SIGNED:
+            {
+                if(_use_dynamic_output_quant)
+                {
+                    std::uniform_int_distribution<int32_t> distribution(-128, 127);
+                    library->fill(tensor, distribution, i);
+                }
+                else
+                {
+                    // Legacy initialization in case the output quantization info can't be reliably estimated
+                    std::pair<int, int>                    bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                    std::uniform_int_distribution<int32_t> distribution(bounds.first, bounds.second);
+                    library->fill(tensor, distribution, i);
+                }
+                break;
+            }
             case DataType::QSYMM8_PER_CHANNEL:
             {
-                std::uniform_int_distribution<int8_t> distribution(-10, 10);
+                int min_bound = 128;
+                int max_bound = -127;
+                for(size_t i = 0; i < _weights_quantization_info.scale().size(); i++)
+                {
+                    std::pair<int, int> bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, i);
+                    if(bounds.first < min_bound)
+                    {
+                        min_bound = bounds.first;
+                    }
+                    if(bounds.second > max_bound)
+                    {
+                        max_bound = bounds.second;
+                    }
+                }
+                std::uniform_int_distribution<int32_t> distribution(min_bound, max_bound);
                 library->fill(tensor, distribution, i);
                 break;
             }
-            case DataType::F16:
+            case DataType::S32:
             {
-                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+                std::uniform_int_distribution<int32_t> distribution(_min_bias, _max_bias);
                 library->fill(tensor, distribution, i);
                 break;
             }
-            case DataType::F32:
+            case DataType::BFLOAT16:
             {
-                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+                arm_compute::utils::uniform_real_distribution_16bit<bfloat16> distribution{ -1.0f, 1.0f };
                 library->fill(tensor, distribution, i);
                 break;
             }
-            case DataType::S32:
+            case DataType::F16:
+            {
+                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
             {
-                std::uniform_int_distribution<int32_t> distribution(-100, 100);
+                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
                 library->fill(tensor, distribution, i);
                 break;
             }
@@ -214,19 +370,33 @@ protected:
     ActivationLayerInfo _act_info{};
     unsigned int        _depth_multiplier{};
     Size2D              _dilation{};
+    bool                _mixed_layout{ false };
+    bool                _in_place{ false };
+    bool                _run_twice{ false };
+    bool                _use_dynamic_output_quant{false};
+
+    int32_t _hash{0};
+    // Random initialization limits
+    // Default values are the previously handcrafted limits
+    // that should be used when we don't use dynamic quantization
+    int32_t _min_bias{-100};
+    int32_t _max_bias{100};
+    int32_t _min_u8{0};
+    int32_t _max_u8{50};
+    int32_t _min_s8{-25};
+    int32_t _max_s8{25};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false, bool in_place = false, bool run_twice = false> // the three bool flags are forwarded to the generic fixture's setup()
 class DepthwiseConvolutionLayerValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, Size2D dilation, unsigned int depth_multiplier, DataType data_type, DataLayout data_layout,
                ActivationLayerInfo act_info)
     {
         DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier,
                                                                                                                data_type, data_type, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(),
-                                                                                                               data_layout, act_info);
+                                                                                                               data_layout, act_info, mixed_layout, in_place, run_twice); // same data type for input and weights, default QuantizationInfo for all tensors
     }
 };
 
@@ -234,7 +404,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DepthwiseConvolutionLayerNativeValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(size_t width, size_t height, size_t channel, size_t batch, Size2D kernel_size, size_t depth_multiplier, Size2D dilation, Size2D stride, bool padding_valid, DataType data_type,
                DataLayout data_layout)
     {
@@ -249,7 +418,7 @@ public:
 
         if(padding_valid)
         {
-            _conv_info = PadStrideInfo();
+            _conv_info = PadStrideInfo(stride.width, stride.height);
         }
         else
         {
@@ -274,13 +443,20 @@ public:
         _biases  = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
         _target  = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
 
-        // Create Depthwise Convolution configure function
-        _dwc.configure(&_src, &_weights, &_biases, &_target, _conv_info, _depth_multiplier, _dilation);
+        add_padding_x({ &_src, &_biases, &_target }, _data_layout);
+        add_padding_x({ &_weights }, _data_layout, true);
 
-        ARM_COMPUTE_EXPECT(_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_target.info()->is_resizable(), framework::LogLevel::ERRORS);
+        // Create Depthwise Convolution configure function
+        const ConvolutionInfo info
+        {
+            _conv_info, _depth_multiplier, ActivationLayerInfo(), _dilation
+        };
+        _dwc.configure(_src.info(), _weights.info(), _biases.info(), _target.info(), info);
+
+        ARM_COMPUTE_ASSERT(_src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_biases.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_target.info()->is_resizable());
     }
 
     void allocate_and_run_target()
@@ -291,18 +467,24 @@ public:
         _biases.allocator()->allocate();
         _target.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_target.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!_src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(_src), 0);
         fill(AccessorType(_weights), 1);
         fill(AccessorType(_biases), 2);
 
+        arm_compute::ITensorPack pack;
+        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, &_src);
+        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &_weights);
+        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &_biases);
+        pack.add_tensor(arm_compute::TensorType::ACL_DST, &_target);
+
         // Compute function
-        _dwc.run();
+        _dwc.run(pack);
     }
 
     void compute_reference()
@@ -315,9 +497,9 @@ public:
         fill(weights, 1);
         fill(biases, 2);
 
-        const TensorShape dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), _conv_info,
-                                                                          _depth_multiplier, _dilation);
-        _reference = reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation);
+        const ConvolutionInfo info{ _conv_info, _depth_multiplier, ActivationLayerInfo(), _dilation };
+        const TensorShape     dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), info);
+        _reference                      = reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation);
     }
 
 protected:
@@ -355,20 +537,21 @@ protected:
     unsigned int  _depth_multiplier{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool in_place = false>
 class DepthwiseConvolutionLayerNativeConfigurableValidationFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(size_t width, size_t height, size_t channel, size_t batch, Size2D kernel_size, size_t depth_multiplier, Size2D dilation, Size2D stride, bool padding_valid, DataType data_type,
-               DataLayout data_layout, const ActivationLayerInfo &act_info, unsigned int n0)
+               DataLayout data_layout, const ActivationLayerInfo &act_info, unsigned int n0, bool export_to_cl_image)
     {
-        _dilation         = dilation;
-        _depth_multiplier = depth_multiplier;
-        _data_type        = data_type;
-        _data_layout      = data_layout;
-        _act_info         = act_info;
-        _n0               = n0;
+        _dilation           = dilation;
+        _depth_multiplier   = depth_multiplier;
+        _data_type          = data_type;
+        _data_layout        = data_layout;
+        _act_info           = act_info;
+        _n0                 = n0;
+        _export_to_cl_image = export_to_cl_image;
+        _in_place           = in_place;
 
         _input_shape   = TensorShape(width, height, channel, batch);
         _weights_shape = TensorShape(kernel_size.width, kernel_size.height, channel * _depth_multiplier);
@@ -376,16 +559,29 @@ public:
 
         if(padding_valid)
         {
-            _conv_info = PadStrideInfo();
+            _conv_info = calculate_same_pad(_input_shape, _weights_shape, PadStrideInfo(stride.width, stride.height), DataLayout::NCHW, _dilation);
         }
         else
         {
-            _conv_info = calculate_same_pad(_input_shape, _weights_shape, PadStrideInfo(stride.width, stride.height), DataLayout::NCHW, _dilation);
+            _conv_info = PadStrideInfo(stride.width, stride.height);
         }
     }
 
     void configure_target()
     {
+#if defined(ARM_COMPUTE_OPENCL_ENABLED)
+        if(_export_to_cl_image)
+        {
+            _validate_output &= image2d_from_buffer_supported(CLKernelLibrary::get().get_device());
+            _validate_output &= (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) != 0);
+        }
+#endif // ARM_COMPUTE_OPENCL_ENABLED
+
+        if(!_validate_output)
+        {
+            return;
+        }
+
         TensorShape input_shape   = _input_shape;
         TensorShape weights_shape = _weights_shape;
 
@@ -396,50 +592,89 @@ public:
         }
 
         // Create tensors
-        _src     = create_tensor<TensorType>(input_shape, _data_type, 1, QuantizationInfo(), _data_layout);
-        _weights = create_tensor<TensorType>(weights_shape, _data_type, 1, QuantizationInfo(), _data_layout);
-        _biases  = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
-        _target  = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
+        _src                      = create_tensor<TensorType>(input_shape, _data_type, 1, QuantizationInfo(), _data_layout);
+        _weights                  = create_tensor<TensorType>(weights_shape, _data_type, 1, QuantizationInfo(), _data_layout);
+        _biases                   = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
+        TensorType *target_to_use = nullptr;
+        if(!_in_place)
+        {
+            _target       = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
+            target_to_use = &_target;
+        }
+
+        DWCComputeKernelInfo dwc_info;
+        dwc_info.n0                         = _n0;
+        dwc_info.m0                         = _conv_info.stride().first == 1 && _dilation.x() == 1 ? 8 : 1;
+        dwc_info.export_input_to_cl_image   = false;
+        dwc_info.export_weights_to_cl_image = _export_to_cl_image;
 
-        DWCWeightsKernelInfo dwc_weights_info;
-        dwc_weights_info.n0 = _n0;
+        const ConvolutionInfo conv_kernel_info
+        {
+            _conv_info, _depth_multiplier, _act_info, _dilation
+        };
 
-        DWCKernelInfo dwc_info;
-        dwc_info.activation_info = _act_info;
+        add_padding_x({ &_src, &_biases, &_target }, _data_layout);
+        add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used
 
         // Create Depthwise Convolution configure function
-        _dwc.configure(&_src, &_weights, &_biases, &_target, dwc_weights_info, dwc_info, _conv_info, _depth_multiplier, _dilation);
+        _dwc.configure(&_src, &_weights, &_biases, target_to_use, dwc_info, conv_kernel_info);
 
-        ARM_COMPUTE_EXPECT(_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_target.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(_src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_biases.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_target.info()->is_resizable());
     }
 
     void allocate_and_run_target()
     {
+        if(!_validate_output)
+        {
+            return;
+        }
+
         // Allocate tensors
         _src.allocator()->allocate();
         _weights.allocator()->allocate();
         _biases.allocator()->allocate();
-        _target.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!_src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_target.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!_src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_biases.info()->is_resizable());
+        if(!_in_place)
+        {
+            _target.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!_target.info()->is_resizable());
+        }
 
         // Fill tensors
         fill(AccessorType(_src), 0);
         fill(AccessorType(_weights), 1);
         fill(AccessorType(_biases), 2);
 
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        _src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        if(!_in_place)
+        {
+            _target.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        }
+
         // Compute function
         _dwc.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        if(!_in_place)
+        {
+            _target.info()->set_data_layout(_data_layout);
+        }
     }
 
     void compute_reference()
     {
+        if(!_validate_output)
+        {
+            return;
+        }
+
         SimpleTensor<T> src{ _input_shape, _data_type };
         SimpleTensor<T> weights{ _weights_shape, _data_type };
         SimpleTensor<T> biases{ _biases_shape, _data_type };
@@ -448,9 +683,9 @@ public:
         fill(weights, 1);
         fill(biases, 2);
 
-        const TensorShape dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), _conv_info,
-                                                                          _depth_multiplier, _dilation);
-        _reference = reference::activation_layer(reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation), _act_info);
+        const ConvolutionInfo info{ _conv_info, _depth_multiplier, _act_info, _dilation };
+        const TensorShape     dst_shape = compute_depthwise_convolution_shape(TensorInfo(_input_shape, 1, _data_type), TensorInfo(_weights_shape, 1, _data_type), info);
+        _reference                      = reference::activation_layer(reference::depthwise_convolution(src, weights, biases, dst_shape, _conv_info, _depth_multiplier, _dilation), _act_info);
     }
 
 protected:
@@ -494,27 +729,28 @@ protected:
     Size2D              _dilation{};
     unsigned int        _depth_multiplier{};
     unsigned int        _n0{};
+    bool                _export_to_cl_image{};
+    bool                _validate_output{ true };
+    bool                _in_place{ false };
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false, bool in_place = false> // both bool flags are forwarded to the generic fixture's setup()
 class DepthwiseConvolutionLayerValidationQuantizedFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, Size2D dilation, unsigned int depth_multiplier, DataType data_type,
                QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, DataLayout data_layout, ActivationLayerInfo act_info)
     {
         DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier, data_type,
                                                                                                                data_type, input_quantization_info, input_quantization_info, output_quantization_info,
-                                                                                                               data_layout, act_info);
+                                                                                                               data_layout, act_info, mixed_layout, in_place); // the input quantization info is also passed as the weights quantization info; no run_twice argument is supplied
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW, bool in_place = false>
 class DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture : public DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>
 {
 public:
-    template <typename...>
     void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, Size2D dilation, unsigned int depth_multiplier, DataType input_data_type, DataType weights_data_type,
                QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, DataLayout data_layout, ActivationLayerInfo act_info)
     {
@@ -532,10 +768,10 @@ public:
         DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>::setup(in_shape, kernel_size, pad_stride_info, dilation, depth_multiplier,
                                                                                                                 input_data_type, weights_data_type,
                                                                                                                 input_quantization_info, QuantizationInfo(weights_scales), output_quantization_info,
-                                                                                                                data_layout, act_info);
+                                                                                                                data_layout, act_info, false, in_place);
     }
 };
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DEPTHWISE_CONVOLUTION_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DEPTHWISECONVOLUTIONLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/DequantizationLayerFixture.h b/tests/validation/fixtures/DequantizationLayerFixture.h
index 1c1f46a64c..4eb25a5bc5 100644
--- a/tests/validation/fixtures/DequantizationLayerFixture.h
+++ b/tests/validation/fixtures/DequantizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DequantizationValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType src_data_type, DataType dst_datatype, DataLayout data_layout)
     {
         _quantization_info = generate_quantization_info(src_data_type, shape.z());
@@ -77,15 +76,15 @@ protected:
         FunctionType dequantization_layer;
         dequantization_layer.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/DirectConvolution3DFixture.h b/tests/validation/fixtures/DirectConvolution3DFixture.h
new file mode 100644
index 0000000000..e80ad2f54f
--- /dev/null
+++ b/tests/validation/fixtures/DirectConvolution3DFixture.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTION3DFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTION3DFIXTURE_H
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/Conv3D.h"
+
+#include <random>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::misc::shape_calculator;
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DirectConvolution3DValidationGenericFixture : public framework::Fixture
+{
+public:
+    using TBias = typename std::conditional < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int32_t, T >::type;
+
+    void setup(const TensorShape &input_shape, int stride_x, int stride_y, int stride_z, int pad_x, int pad_y, int pad_z, unsigned int kernel_width, int kernel_height, int kernel_depth,
+               unsigned int num_kernels, bool has_bias, const ActivationLayerInfo &act_info, const DataType &data_type, const DataLayout &data_layout,
+               const QuantizationInfo &src_qinfo = QuantizationInfo(), const QuantizationInfo &weights_qinfo = QuantizationInfo(), const QuantizationInfo &dst_qinfo = QuantizationInfo())
+    {
+        ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NDHWC);
+
+        const TensorShape weights_shape(num_kernels, input_shape[0], kernel_width, kernel_height, kernel_depth);
+        const TensorShape bias_shape(num_kernels);
+        const DataType    bias_data_type = is_data_type_quantized(data_type) ? DataType::S32 : data_type;
+        const Conv3dInfo  conv3d_info(Size3D(stride_x, stride_y, stride_z), Padding3D(pad_x, pad_y, pad_z), act_info, Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false);
+        const TensorShape output_shape = compute_conv3d_shape(input_shape, weights_shape, conv3d_info);
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, conv3d_info, has_bias, data_type, bias_data_type, data_layout, src_qinfo, weights_qinfo, dst_qinfo);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, conv3d_info, has_bias, data_type, bias_data_type, src_qinfo, weights_qinfo, dst_qinfo);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F16:
+            {
+                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const Conv3dInfo &conv3d_info,
+                              bool has_bias, const DataType &data_type, const DataType &bias_data_type, const DataLayout &data_layout, const QuantizationInfo &src_qinfo,
+                              const QuantizationInfo &weights_qinfo, const QuantizationInfo &dst_qinfo)
+    {
+        // Create tensors
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, src_qinfo, data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, weights_qinfo, data_layout);
+        TensorType bias    = has_bias ? create_tensor<TensorType>(bias_shape, bias_data_type, 1, QuantizationInfo()) : TensorType();
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, dst_qinfo, data_layout);
+
+        // Create and configure function
+        FunctionType conv{};
+        conv.configure(&src, &weights, has_bias ? &bias : nullptr, &dst, conv3d_info);
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        weights.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        fill(AccessorType(src), 0);
+        fill(AccessorType(weights), 1);
+
+        if(has_bias)
+        {
+            ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+            bias.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+            fill(AccessorType(bias), 2);
+        }
+
+        // Compute Direct Convolution 3D function
+        conv.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape,
+                                      const Conv3dInfo &conv3d_info, bool has_bias, const DataType &data_type, const DataType &bias_data_type, const QuantizationInfo &src_qinfo,
+                                      const QuantizationInfo &weights_qinfo, const QuantizationInfo &dst_qinfo)
+    {
+        // Create reference
+        SimpleTensor<T>     src{ input_shape, data_type, 1, src_qinfo };
+        SimpleTensor<T>     weights{ weights_shape, data_type, 1, weights_qinfo };
+        SimpleTensor<TBias> bias{ bias_shape, bias_data_type };
+        SimpleTensor<T>     dst{ output_shape, data_type, 1, dst_qinfo };
+
+        // Fill reference
+        fill(src, 0);
+        fill(weights, 1);
+
+        if(has_bias)
+        {
+            fill(bias, 2);
+        }
+
+        return reference::activation_layer(reference::conv3d<T, TBias>(src, weights, bias, dst, conv3d_info), conv3d_info.act_info);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DirectConvolution3DValidationFixture : public DirectConvolution3DValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape input_shape, int stride_x, int stride_y, int stride_z, int pad_x, int pad_y, int pad_z, unsigned int kernel_width, int kernel_height, int kernel_depth,
+               unsigned int num_kernels, bool has_bias, ActivationLayerInfo act_info, DataType data_type, DataLayout data_layout)
+    {
+        DirectConvolution3DValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, stride_z, pad_x, pad_y, pad_z, kernel_width, kernel_height,
+                                                                                                      kernel_depth, num_kernels, has_bias, act_info, data_type, data_layout);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DirectConvolution3DValidationQuantizedFixture : public DirectConvolution3DValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape input_shape, int stride_x, int stride_y, int stride_z, int pad_x, int pad_y, int pad_z, unsigned int kernel_width, int kernel_height, int kernel_depth,
+               unsigned int num_kernels, bool has_bias, ActivationLayerInfo act_info, DataType data_type, DataLayout data_layout, QuantizationInfo src_qinfo, QuantizationInfo weights_qinfo,
+               QuantizationInfo dst_qinfo)
+    {
+        DirectConvolution3DValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, stride_z, pad_x, pad_y, pad_z, kernel_width, kernel_height,
+                                                                                                      kernel_depth, num_kernels, has_bias, act_info, data_type, data_layout, src_qinfo,
+                                                                                                      weights_qinfo, dst_qinfo);
+    }
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTION3DFIXTURE_H
diff --git a/tests/validation/fixtures/DirectConvolutionLayerFixture.h b/tests/validation/fixtures/DirectConvolutionLayerFixture.h
index 8e4de77535..6f204642ca 100644
--- a/tests/validation/fixtures/DirectConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DirectConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,6 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTIONLAYERFIXTURE_H
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -51,12 +55,54 @@ class DirectConvolutionValidationGenericFixture : public framework::Fixture
 public:
     using TBias = typename std::conditional < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int32_t, T >::type;
 
-    template <typename...>
+    void setup_quantization(const TensorShape &input_shape, const TensorShape &weights_shape, QuantizationInfo &input_q_info,
+        QuantizationInfo &weights_q_info, DataType data_type)
+    {
+        const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+        const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+
+        std::mt19937                           generator(library->seed() + _hash);
+        std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        input_q_info = QuantizationInfo(scale_lhs, offset_lhs);
+        weights_q_info = QuantizationInfo(scale_rhs, offset_rhs);
+
+        QuantizationHint q_hint = suggest_conv_dst_q_info_and_bias(input_q_info, weights_q_info,
+            weights_shape.y() /* heights */, weights_shape.x() /* width */, input_shape.z() /* channels */,
+            data_type, 0.5f /* bias_fraction */);
+
+        _dst_q_info = q_hint.q_info;
+        _min_bias = q_hint.bias_min;
+        _max_bias = q_hint.bias_max;
+
+        // Do not change here as these limits are the natural limits of the associated data types and
+        // are embedded in the computation of the dst quantization info.
+        _min_u8 = 0;
+        _max_u8 = 255;
+        _min_s8 = -128;
+        _max_s8 = 127;
+    }
+
     void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels,
-               DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout)
+               DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout, bool mixed_layout = false)
     {
-        _quantization_info = quantization_info;
+        // This hash is used by random generators. There may be hash collisions but
+        // this is intentional as it's a very easy way to make the current
+        // random generation process differ across most test configurations,
+        // which were using the same set of values before.
+        _hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] +
+                stride_x + stride_y + pad_x + pad_y + kernel_size + num_kernels + mixed_layout
+                + (data_layout == DataLayout::NHWC);
+
         _data_type         = data_type;
+        _mixed_layout      = mixed_layout;
 
         TensorShape         weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels);
         const TensorShape   bias_shape(num_kernels);
@@ -68,27 +114,66 @@ public:
 
         const TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, info);
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info, data_layout);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info);
+        QuantizationInfo input_q_info = quantization_info;
+        QuantizationInfo weights_q_info = quantization_info;
+        _dst_q_info = quantization_info;
+
+        if(is_data_type_quantized(data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY))
+        {
+            setup_quantization(input_shape, weights_shape, input_q_info, weights_q_info, data_type);
+        }
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info, data_layout);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info);
     }
 
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
                DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout)
     {
         ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
         ARM_COMPUTE_UNUSED(dilation);
 
-        _quantization_info = quantization_info;
+        // This hash is used by random generators. There may be hash collisions but
+        // this is intentional as it's a very easy way to make the current
+        // random generation process differ across most test configurations,
+        // which were using the same set of values before.
+        _hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] +
+            weights_shape[0] + weights_shape[1] + weights_shape[2] + weights_shape[3] + dilation.x() +
+            dilation.y() + info.pad_bottom() + info.pad_left() + info.pad_right() + info.pad_top();
+
         _data_type         = data_type;
 
         const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info, data_layout);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info, act_info);
+        QuantizationInfo input_q_info = quantization_info;
+        QuantizationInfo weights_q_info = quantization_info;
+        _dst_q_info = quantization_info;
+
+        if(is_data_type_quantized(data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY))
+        {
+            setup_quantization(input_shape, weights_shape, input_q_info, weights_q_info, data_type);
+        }
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info, data_layout);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, input_q_info, weights_q_info, act_info);
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        DataLayout data_layout = src.info()->data_layout();
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Convolution function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout);
+        dst.info()->set_data_layout(data_layout);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i)
     {
@@ -96,14 +181,14 @@ protected:
         {
             case DataType::QASYMM8:
             {
-                std::uniform_int_distribution<uint8_t> distribution(0, 50);
+                std::uniform_int_distribution<uint32_t> distribution(_min_u8, _max_u8);
                 library->fill(tensor, distribution, i);
                 break;
             }
             case DataType::QASYMM8_SIGNED:
             {
                 // Use small input range to avoid all the test results being saturated at the end.
-                std::uniform_int_distribution<int8_t> distribution(-25, 25);
+                std::uniform_int_distribution<int32_t> distribution(_min_s8, _max_s8);
                 library->fill(tensor, distribution, i);
                 break;
             }
@@ -121,7 +206,7 @@ protected:
             }
             case DataType::S32:
             {
-                std::uniform_int_distribution<int32_t> distribution(-5, 5);
+                std::uniform_int_distribution<int32_t> distribution(_min_bias, _max_bias);
                 library->fill(tensor, distribution, i);
                 break;
             }
@@ -131,7 +216,7 @@ protected:
     }
 
     TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info,
-                              DataType data_type, DataType bias_data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, const DataLayout &data_layout)
+                              DataType data_type, DataType bias_data_type, QuantizationInfo input_q_info, QuantizationInfo weights_q_info, ActivationLayerInfo act_info, const DataLayout &data_layout)
     {
         if(data_layout == DataLayout::NHWC)
         {
@@ -141,19 +226,22 @@ protected:
         }
 
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, quantization_info, data_layout);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, quantization_info, data_layout);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, bias_data_type, 1, quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, quantization_info, data_layout);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, input_q_info, data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, weights_q_info, data_layout);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, bias_data_type, 1, QuantizationInfo());
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, _dst_q_info, data_layout);
+
+        add_padding_x({ &src, &bias, &dst }, data_layout);
+        add_padding_x({ &weights }, data_layout, input_shape[0] % 4 == 0); // Don't add left padding if cl image will be used
 
         // Create and configure function
         FunctionType conv;
         conv.configure(&src, &weights, &bias, &dst, info, act_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -161,67 +249,86 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
-        fill(AccessorType(src), 0);
-        fill(AccessorType(weights), 1);
-        fill(AccessorType(bias), 2);
+        fill(AccessorType(src), 0 + _hash);
+        fill(AccessorType(weights), 1 + _hash);
+        fill(AccessorType(bias), 2 + _hash);
 
-        // Compute NEConvolutionLayer function
-        conv.run();
+        if(_mixed_layout)
+        {
+            mix_layout(conv, src, dst);
+        }
+        else
+        {
+            // Compute Convolution function
+            conv.run();
+        }
 
         return dst;
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                                      DataType data_type, DataType bias_data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
+                                      DataType data_type, DataType bias_data_type, QuantizationInfo input_q_info, QuantizationInfo weights_q_info, ActivationLayerInfo act_info)
     {
         // Create reference
-        SimpleTensor<T>     src{ input_shape, data_type, 1, quantization_info };
-        SimpleTensor<T>     weights{ weights_shape, data_type, 1, quantization_info };
-        SimpleTensor<TBias> bias{ bias_shape, bias_data_type, 1, quantization_info };
+        SimpleTensor<T>     src{ input_shape, data_type, 1, input_q_info };
+        SimpleTensor<T>     weights{ weights_shape, data_type, 1, weights_q_info };
+        SimpleTensor<TBias> bias{ bias_shape, bias_data_type, 1, QuantizationInfo() };
 
         // Fill reference
-        fill(src, 0);
-        fill(weights, 1);
-        fill(bias, 2);
+        fill(src, 0 + _hash);
+        fill(weights, 1 + _hash);
+        fill(bias, 2 + _hash);
 
-        SimpleTensor<T> dst = reference::convolution_layer<T>(src, weights, bias, output_shape, info);
-        return (act_info.enabled()) ? reference::activation_layer<T>(dst, act_info) : dst;
+        SimpleTensor<T> dst = reference::convolution_layer<T>(src, weights, bias, output_shape, info,
+            Size2D(1U, 1U) /* dilation */, 1 /* num_groups */, _dst_q_info);
+        SimpleTensor<T> dst2 = (act_info.enabled()) ? reference::activation_layer<T>(dst, act_info) : dst;
+        return dst2;
     }
     TensorType       _target{};
     SimpleTensor<T>  _reference{};
-    QuantizationInfo _quantization_info{};
+    QuantizationInfo _dst_q_info{};
     DataType         _data_type{};
+    bool             _mixed_layout{ false };
+    int32_t _hash{0};
+
+    // Random initialization limits
+    // Default values are previously handcrafted limits
+    // that should be used when we don't use dynamic quantization
+    int32_t _min_bias{-5};
+    int32_t _max_bias{5};
+    int32_t _min_u8{0};
+    int32_t _max_u8{50};
+    int32_t _min_s8{-25};
+    int32_t _max_s8{25};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class DirectConvolutionValidationFixture : public DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, ActivationLayerInfo act_info,
                DataLayout data_layout)
     {
         DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, QuantizationInfo(),
-                                                                                                    act_info, data_layout);
+                                                                                                    act_info, data_layout, mixed_layout);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class DirectConvolutionValidationQuantizedFixture : public DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info,
                ActivationLayerInfo act_info, DataLayout data_layout)
     {
         DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, quantization_info,
-                                                                                                    act_info, data_layout);
+                                                                                                    act_info, data_layout, mixed_layout);
     }
 };
 
@@ -229,7 +336,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DirectConvolutionValidationWithTensorShapesQuantizedFixture : public DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
                DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout)
     {
@@ -242,7 +348,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DirectConvolutionValidationWithTensorShapesFixture : public DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
                DataType data_type, ActivationLayerInfo act_info)
     {
@@ -254,3 +359,5 @@ public:
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DIRECTCONVOLUTIONLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
deleted file mode 100644
index 6ef30d3c21..0000000000
--- a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/fixtures/ConvolutionLayerFixture.h"
-#include "tests/validation/reference/ConvolutionLayer.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DirectConvolutionValidationGenericTensorShiftFixture : public framework::Fixture
-{
-public:
-    using TBias = typename std::conditional<std::is_same<typename std::decay<T>::type, uint8_t>::value, int32_t, T>::type;
-
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels,
-               DataType data_type, QuantizationInfo quantization_info)
-    {
-        _quantization_info = quantization_info;
-        _data_type         = data_type;
-
-        const TensorShape   weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels);
-        const TensorShape   bias_shape(num_kernels);
-        const PadStrideInfo info(stride_x, stride_y, pad_x, pad_y, DimensionRoundingType::FLOOR);
-        const TensorShape   output_shape   = get_output_shape(input_shape, weights_shape, info);
-        const DataType      bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
-
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info);
-    }
-
-    template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, unsigned int dilation_x, unsigned int dilation_y,
-               DataType data_type, QuantizationInfo quantization_info)
-    {
-        ARM_COMPUTE_UNUSED(dilation_x, dilation_y);
-
-        _quantization_info = quantization_info;
-        _data_type         = data_type;
-
-        const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
-
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, quantization_info);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        switch(tensor.data_type())
-        {
-            case DataType::QASYMM8:
-            {
-                std::uniform_int_distribution<uint8_t> distribution(0, 50);
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case DataType::F16:
-            {
-                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case DataType::F32:
-            {
-                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case DataType::S32:
-            {
-                std::uniform_int_distribution<int32_t> distribution(-5, 5);
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            default:
-                library->fill_tensor_uniform(tensor, i);
-        }
-    }
-
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                              DataType data_type, DataType bias_data_type, QuantizationInfo quantization_info)
-    {
-        // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, quantization_info);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, quantization_info);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, bias_data_type, 1, quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, quantization_info);
-
-        TensorShape output_shape1 = get_output_shape(output_shape, weights_shape, info);
-        TensorType  dst1          = create_tensor<TensorType>(output_shape1, data_type, 1, quantization_info);
-
-        // Create and configure function
-        FunctionType conv;
-        conv.configure(&src, &weights, &bias, &dst, info);
-        FunctionType conv1;
-        conv1.configure(&dst, &weights, &bias, &dst1, info);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst1.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        weights.allocator()->allocate();
-        bias.allocator()->allocate();
-        dst.allocator()->allocate();
-        dst1.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst1.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), 0);
-        fill(AccessorType(weights), 1);
-        fill(AccessorType(bias), 2);
-
-        // Compute NEConvolutionLayer function
-        GCScheduler::get().memory_barrier();
-        conv.run();
-        GCScheduler::get().memory_barrier();
-        conv1.run();
-
-        return dst1;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                                      DataType data_type, DataType bias_data_type, QuantizationInfo quantization_info)
-    {
-        // Create reference
-        SimpleTensor<T>     src{ input_shape, data_type, 1, quantization_info };
-        SimpleTensor<T>     weights{ weights_shape, data_type, 1, quantization_info };
-        SimpleTensor<TBias> bias{ bias_shape, bias_data_type, 1, quantization_info };
-
-        SimpleTensor<T> dst{ output_shape, data_type, 1, quantization_info };
-        TensorShape     output_shape1 = get_output_shape(output_shape, weights_shape, info);
-
-        // Fill reference
-        fill(src, 0);
-        fill(weights, 1);
-        fill(bias, 2);
-
-        dst = reference::convolution_layer<T>(src, weights, bias, output_shape, info);
-        return reference::convolution_layer<T>(dst, weights, bias, output_shape1, info);
-    }
-
-    TensorType       _target{};
-    SimpleTensor<T>  _reference{};
-    QuantizationInfo _quantization_info{};
-    DataType         _data_type{};
-
-private:
-    TensorShape get_output_shape(TensorShape in_shape, TensorShape kernel_shape, const PadStrideInfo &info)
-    {
-        TensorShape out_shape(in_shape);
-        const std::pair<unsigned int, unsigned int> scaled_dims = scaled_dimensions(in_shape.x(),
-                                                                                    in_shape.y(),
-                                                                                    kernel_shape.x(),
-                                                                                    kernel_shape.y(),
-                                                                                    info);
-        out_shape.set(0, scaled_dims.first);
-        out_shape.set(1, scaled_dims.second);
-        out_shape.set(2, kernel_shape[3]);
-        return out_shape;
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DirectConvolutionValidationTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type)
-    {
-        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type,
-                                                                                                               QuantizationInfo());
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DirectConvolutionValidationQuantizedTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info)
-    {
-        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type,
-                                                                                                               quantization_info);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DirectConvolutionValidationWithTensorShapesQuantizedTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, unsigned int dilation_x, unsigned int dilation_y,
-               DataType data_type, QuantizationInfo quantization_info)
-    {
-        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation_x, dilation_y, data_type,
-                                                                                                               quantization_info);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DirectConvolutionValidationWithTensorShapesTensorShiftFixture : public DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, unsigned int dilation_x, unsigned int dilation_y,
-               DataType data_type)
-    {
-        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation_x, dilation_y, data_type,
-                                                                                                               QuantizationInfo());
-    }
-};
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/fixtures/DropoutLayerFixture.h b/tests/validation/fixtures/DropoutLayerFixture.h
index 63df936032..a84f2a6407 100644
--- a/tests/validation/fixtures/DropoutLayerFixture.h
+++ b/tests/validation/fixtures/DropoutLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DropoutLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, float ratio, bool forward, DataType data_type)
     {
         _target = compute_target(shape, ratio, forward, data_type);
@@ -70,17 +69,17 @@ protected:
         FunctionType dropout_layer;
         dropout_layer.configure(&src, &mask, &dst, ratio, forward);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         mask.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!mask.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!mask.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/ElementWiseUnaryFixture.h b/tests/validation/fixtures/ElementWiseUnaryFixture.h
deleted file mode 100644
index 8cffef48f6..0000000000
--- a/tests/validation/fixtures/ElementWiseUnaryFixture.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_ELEMENTWISE_UNARY_FIXTURE
-#define ARM_COMPUTE_TEST_ELEMENTWISE_UNARY_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/ElementWiseUnary.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ElementWiseUnaryValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, DataType input_data_type, bool in_place, ElementWiseUnary op)
-    {
-        _op        = op;
-        _target    = compute_target(input_shape, input_data_type, in_place);
-        _reference = compute_reference(input_shape, input_data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i, DataType data_type)
-    {
-        using FloatType             = typename std::conditional < std::is_same<T, half>::value || std::is_floating_point<T>::value, T, float >::type;
-        using FloatDistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<FloatType>>::type;
-
-        switch(_op)
-        {
-            case ElementWiseUnary::EXP:
-            {
-                FloatDistributionType distribution{ FloatType(-1.0f), FloatType(1.0f) };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case ElementWiseUnary::RSQRT:
-            {
-                FloatDistributionType distribution{ FloatType(1.0f), FloatType(2.0f) };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case ElementWiseUnary::ABS:
-            case ElementWiseUnary::NEG:
-            {
-                switch(data_type)
-                {
-                    case DataType::F16:
-                    {
-                        arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -2.0f, 2.0f };
-                        library->fill(tensor, distribution, i);
-                        break;
-                    }
-                    case DataType::F32:
-                    {
-                        FloatDistributionType distribution{ FloatType(-2.0f), FloatType(2.0f) };
-                        library->fill(tensor, distribution, i);
-                        break;
-                    }
-                    case DataType::S32:
-                    {
-                        std::uniform_int_distribution<int32_t> distribution(-100, 100);
-                        library->fill(tensor, distribution, i);
-                        break;
-                    }
-                    default:
-                        ARM_COMPUTE_ERROR("DataType for Elementwise Negation Not implemented");
-                }
-                break;
-            }
-            case ElementWiseUnary::LOG:
-            {
-                FloatDistributionType distribution{ FloatType(0.0000001f), FloatType(100.0f) };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case ElementWiseUnary::SIN:
-            {
-                FloatDistributionType distribution{ FloatType(-100.00f), FloatType(100.00f) };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case ElementWiseUnary::ROUND:
-            {
-                FloatDistributionType distribution{ FloatType(100.0f), FloatType(-100.0f) };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
-        }
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type, bool in_place)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, data_type);
-
-        TensorType *actual_dst = in_place ? &src : &dst;
-
-        // Create and configure function
-        FunctionType elwiseunary_layer;
-        elwiseunary_layer.configure(&src, actual_dst);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        src.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        if(!in_place)
-        {
-            ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-            dst.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        }
-
-        // Fill tensors
-        fill(AccessorType(src), 0, data_type);
-
-        // Compute function
-        elwiseunary_layer.run();
-
-        if(in_place)
-        {
-            return src;
-        }
-        else
-        {
-            return dst;
-        }
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src, 0, data_type);
-
-        return reference::elementwise_unary<T>(src, _op);
-    }
-
-    TensorType       _target{};
-    SimpleTensor<T>  _reference{};
-    ElementWiseUnary _op{};
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class RsqrtValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::RSQRT);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ExpValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::EXP);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class NegValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::NEG);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class NegValidationInPlaceFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type, bool in_place)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, in_place, ElementWiseUnary::NEG);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class LogValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::LOG);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class AbsValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ABS);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class SinValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::SIN);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class RoundValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type)
-    {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ROUND);
-    }
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ELEMENTWISE_UNARY_FIXTURE */
diff --git a/tests/validation/fixtures/ElementwiseOperationsFixture.h b/tests/validation/fixtures/ElementwiseOperationsFixture.h
index dcb408c801..f36a1f75b7 100644
--- a/tests/validation/fixtures/ElementwiseOperationsFixture.h
+++ b/tests/validation/fixtures/ElementwiseOperationsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_ELEMENTWISE_OPERATIONS_FIXTURE
-#define ARM_COMPUTE_TEST_ELEMENTWISE_OPERATIONS_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_ELEMENTWISEOPERATIONSFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_ELEMENTWISEOPERATIONSFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -45,12 +46,14 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticOperationsGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(ArithmeticOperation op, const TensorShape &shape0, const TensorShape &shape1,
                DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace = false, bool use_dynamic_shape = false)
     {
-        _op        = op;
+        _op                = op;
+        _use_dynamic_shape = use_dynamic_shape;
+        _is_inplace        = is_inplace;
+
         _target    = compute_target(shape0, shape1, data_type0, data_type1, output_data_type, qinfo0, qinfo1, qinfo_out);
         _reference = compute_reference(shape0, shape1, data_type0, data_type1, output_data_type, qinfo0, qinfo1, qinfo_out);
     }
@@ -83,26 +86,67 @@ protected:
                               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
     {
         // Create tensors
-        TensorType ref_src1 = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
-        TensorType ref_src2 = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
-        TensorType dst      = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out);
+        const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
+        TensorType        ref_src1  = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
+        TensorType        ref_src2  = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
+        TensorType        dst       = create_tensor<TensorType>(out_shape, output_data_type, 1, qinfo_out);
+
+        // Check whether to do in-place computation and whether the inputs are broadcast compatible
+        TensorType *actual_dst = &dst;
+        if(_is_inplace)
+        {
+            bool src1_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out) && (data_type0 == output_data_type);
+            bool src2_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out) && (data_type1 == output_data_type);
+            bool do_in_place     = out_shape.total_size() != 0 && (src1_is_inplace || src2_is_inplace);
+            ARM_COMPUTE_ASSERT(do_in_place);
+
+            if(src1_is_inplace)
+            {
+                actual_dst = &ref_src1;
+            }
+            else
+            {
+                actual_dst = &ref_src2;
+            }
+        }
+
+        // If _use_dynamic_shape is true, this fixture tests the dynamic-shape scenario:
+        // - At configure time, all input tensors are marked as dynamic using set_tensor_dynamic()
+        // - After configure, tensors are marked as static for run using set_tensor_static()
+        // - The tensors with static shape are given to run()
+        if(_use_dynamic_shape)
+        {
+            set_tensor_dynamic(ref_src1);
+            set_tensor_dynamic(ref_src2);
+        }
 
         // Create and configure function
         FunctionType elem_op;
-        elem_op.configure(&ref_src1, &ref_src2, &dst);
+        elem_op.configure(&ref_src1, &ref_src2, actual_dst);
 
-        ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        if(_use_dynamic_shape)
+        {
+            set_tensor_static(ref_src1);
+            set_tensor_static(ref_src2);
+        }
+
+        ARM_COMPUTE_ASSERT(ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(ref_src2.info()->is_resizable());
 
         // Allocate tensors
         ref_src1.allocator()->allocate();
         ref_src2.allocator()->allocate();
-        dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        // If the computation is not in-place, the original dst still needs to be allocated
+        if(!_is_inplace)
+        {
+            ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+            dst.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        }
+
+        ARM_COMPUTE_ASSERT(!ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!ref_src2.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(ref_src1), 0);
@@ -111,7 +155,7 @@ protected:
         // Compute function
         elem_op.run();
 
-        return dst;
+        return std::move(*actual_dst);
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1,
@@ -133,6 +177,8 @@ protected:
     TensorType          _target{};
     SimpleTensor<T>     _reference{};
     ArithmeticOperation _op{ ArithmeticOperation::ADD };
+    bool                _use_dynamic_shape{ false };
+    bool                _is_inplace{ false };
 };
 
 // Arithmetic operation fused with activation function
@@ -140,15 +186,15 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticOperationsFuseActivationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(ArithmeticOperation op, const TensorShape &shape0, const TensorShape &shape1,
                DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info, bool is_inplace = true)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
-        _act_info = act_info;
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
+        _act_info   = act_info;
+        _is_inplace = is_inplace;
     }
 
 protected:
@@ -156,26 +202,51 @@ protected:
                               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
     {
         // Create tensors
-        TensorType ref_src1 = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
-        TensorType ref_src2 = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
-        TensorType dst      = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out);
+        const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
+        TensorType        ref_src1  = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
+        TensorType        ref_src2  = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
+        TensorType        dst       = create_tensor<TensorType>(out_shape, output_data_type, 1, qinfo_out);
+
+        // Check whether to do in-place computation and whether the inputs are broadcast compatible
+        TensorType *actual_dst = &dst;
+        if(_is_inplace)
+        {
+            bool src1_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out) && (data_type0 == output_data_type);
+            bool src2_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out) && (data_type1 == output_data_type);
+            bool do_in_place     = out_shape.total_size() != 0 && (src1_is_inplace || src2_is_inplace);
+            ARM_COMPUTE_ASSERT(do_in_place);
+
+            if(src1_is_inplace)
+            {
+                actual_dst = &ref_src1;
+            }
+            else
+            {
+                actual_dst = &ref_src2;
+            }
+        }
 
         // Create and configure function
         FunctionType elem_op;
-        elem_op.configure(&ref_src1, &ref_src2, &dst, _act_info);
+        elem_op.configure(&ref_src1, &ref_src2, actual_dst, _act_info);
 
-        ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(ref_src2.info()->is_resizable());
 
         // Allocate tensors
         ref_src1.allocator()->allocate();
         ref_src2.allocator()->allocate();
-        dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        // When not computing in place, the original dst still needs to be allocated
+        if(!_is_inplace)
+        {
+            ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+            dst.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        }
+
+        ARM_COMPUTE_ASSERT(!ref_src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!ref_src2.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(ref_src1), 0);
@@ -184,7 +255,7 @@ protected:
         // Compute function
         elem_op.run();
 
-        return dst;
+        return std::move(*actual_dst);
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1,
@@ -197,18 +268,18 @@ protected:
     }
 
     ActivationLayerInfo _act_info{};
+    bool                _is_inplace{ false };
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class ArithmeticDivisionBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -216,12 +287,35 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticDivisionValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticDivisionBroadcastDynamicShapeValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
+    {
+        ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape0, shape1,
+                                                                                             data_type0, data_type1, output_data_type,
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace, true /* use_dynamic_shape */);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticDivisionDynamicShapeValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
+    {
+        ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape, shape,
+                                                                                             data_type0, data_type1, output_data_type,
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace, true /* use_dynamic_shape */);
     }
 };
 
@@ -229,12 +323,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticDivisionBroadcastValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape0, shape1,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -242,12 +335,23 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticDivisionValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape, shape,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticDivisionValidationIntegerFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
+    {
+        ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape, shape,
+                                                                                                    data_type0, data_type1, output_data_type,
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -255,14 +359,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ArithmeticDivisionValidationQuantizedFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -270,12 +373,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMaxBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MAX, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -283,12 +385,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMaxValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MAX, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -296,12 +397,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMaxBroadcastValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MAX, shape0, shape1,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -309,12 +409,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMaxValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MAX, shape, shape,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -322,14 +421,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMaxValidationQuantizedFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MAX, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -337,14 +435,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMaxQuantizedBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MAX, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -352,12 +449,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMinBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MIN, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -365,12 +461,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMinValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MIN, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -378,12 +473,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMinBroadcastValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MIN, shape0, shape1,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -391,12 +485,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMinValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MIN, shape, shape,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -404,14 +497,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMinValidationQuantizedFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MIN, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -419,14 +511,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseMinQuantizedBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::MIN, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -434,12 +525,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseSquaredDiffBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::SQUARED_DIFF, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -447,12 +537,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseSquaredDiffValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::SQUARED_DIFF, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -460,12 +549,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseSquaredDiffBroadcastValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::SQUARED_DIFF, shape0, shape1,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -473,12 +561,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseSquaredDiffValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::SQUARED_DIFF, shape, shape,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -486,14 +573,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseSquaredDiffValidationQuantizedFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::SQUARED_DIFF, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -501,14 +587,13 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwiseSquaredDiffQuantizedBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
 
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::SQUARED_DIFF, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             qinfo0, qinfo1, qinfo_out);
+                                                                                             qinfo0, qinfo1, qinfo_out, is_inplace);
     }
 };
 
@@ -516,7 +601,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PReluLayerBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::PRELU, shape0, shape1,
@@ -529,7 +613,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PReluLayerValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::PRELU, shape, shape,
@@ -542,7 +625,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PReluLayerValidationQuantizedFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type,
                QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
 
@@ -557,7 +639,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PReluLayerQuantizedBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
                QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
 
@@ -572,12 +653,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwisePowerBroadcastValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::POWER, shape0, shape1,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -585,12 +665,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwisePowerValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, bool is_inplace)
     {
         ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::POWER, shape, shape,
                                                                                              data_type0, data_type1, output_data_type,
-                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo());
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), is_inplace);
     }
 };
 
@@ -598,12 +677,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwisePowerBroadcastValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::POWER, shape0, shape1,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -611,16 +689,15 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ElementwisePowerValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ActivationLayerInfo act_info, bool is_inplace)
     {
         ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::POWER, shape, shape,
                                                                                                     data_type0, data_type1, output_data_type,
-                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                    QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ARITHMETIC_OPERATIONS_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_ELEMENTWISEOPERATIONSFIXTURE_H
diff --git a/tests/validation/fixtures/ElementwiseUnaryFixture.h b/tests/validation/fixtures/ElementwiseUnaryFixture.h
new file mode 100644
index 0000000000..15344288db
--- /dev/null
+++ b/tests/validation/fixtures/ElementwiseUnaryFixture.h
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_ELEMENTWISE_UNARY_FIXTURE
+#define ARM_COMPUTE_TEST_ELEMENTWISE_UNARY_FIXTURE
+
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ElementwiseUnary.h"
+
+#include <tuple>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ElementWiseUnaryValidationFixture : public framework::Fixture // Runs one ElementWiseUnary op through FunctionType (-> _target) and through the reference implementation (-> _reference)
+{
+public:
+    void setup(TensorShape input_shape, DataType input_data_type, bool in_place, ElementWiseUnary op,
+               bool use_dynamic_shape = false, QuantizationInfo qinfo = QuantizationInfo(), QuantizationInfo qinfo_out = QuantizationInfo())
+    {
+        _op                = op;                // selects the fill range in fill() and the operation in compute_reference()
+        _use_dynamic_shape = use_dynamic_shape; // must be stored before compute_target() runs, because compute_target() reads it
+        _target            = compute_target(input_shape, input_data_type, in_place, qinfo, qinfo_out);
+        _reference         = compute_reference(input_shape, input_data_type, qinfo, qinfo_out);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, DataType data_type) // fills 'tensor' from a distribution chosen by (_op, data_type); 'i' is forwarded unchanged to the library fill call
+    {
+        using FloatType             = typename std::conditional < std::is_same<T, half>::value || std::is_floating_point<T>::value, T, float >::type; // T for half/floating-point T, float otherwise
+        using FloatDistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<FloatType>>::type;
+
+        switch(_op)
+        {
+            case ElementWiseUnary::EXP:
+            {
+                switch(data_type)
+                {
+                    case DataType::F32:
+                    {
+                        FloatDistributionType distribution{ FloatType(-86.63f), FloatType(88.36f) }; // NOTE(review): bounds presumably keep exp() finite and normal in F32 - confirm
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::F16:
+                    {
+                        FloatDistributionType distribution{ FloatType(-9.00f), FloatType(10.73f) }; // NOTE(review): bounds presumably keep exp() finite and normal in F16 - confirm
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::QASYMM8:
+                    case DataType::QASYMM8_SIGNED:
+                        library->fill_tensor_uniform(tensor, i);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Not implemented");
+                }
+
+                break;
+            }
+            case ElementWiseUnary::RSQRT:
+            case ElementWiseUnary::LOG:
+            {
+                // For floating-point data types, the chosen input range is strictly positive
+                // (i.e. positive and negative zeros are excluded).
+                switch(data_type)
+                {
+                    case DataType::F32:
+                    {
+                        FloatDistributionType distribution{ std::numeric_limits<float>::min(), std::numeric_limits<float>::max() };
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::F16:
+                    {
+                        FloatDistributionType distribution{ FloatType(0.00006103515625f), FloatType(65504.0f) }; // 2^-14 .. 65504: positive normal range of IEEE binary16
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::QASYMM8:
+                    case DataType::QASYMM8_SIGNED:
+                        library->fill_tensor_uniform(tensor, i);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Not implemented");
+                }
+
+                break;
+            }
+            case ElementWiseUnary::SIN:
+            {
+                switch(data_type)
+                {
+                    case DataType::F32:
+                    case DataType::F16:
+                    {
+                        FloatDistributionType distribution{ FloatType(-100.0f), FloatType(100.0f) };
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::S32:
+                    {
+                        std::uniform_int_distribution<int32_t> distribution(std::numeric_limits<int32_t>::lowest(), std::numeric_limits<int32_t>::max());
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::QASYMM8:
+                    case DataType::QASYMM8_SIGNED:
+                        library->fill_tensor_uniform(tensor, i);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Not implemented");
+                }
+
+                break;
+            }
+            case ElementWiseUnary::ABS:
+            case ElementWiseUnary::NEG:
+            case ElementWiseUnary::ROUND:
+            {
+                switch(data_type)
+                {
+                    case DataType::F32:
+                    {
+                        FloatDistributionType distribution{ std::numeric_limits<float>::lowest() / 2, std::numeric_limits<float>::max() / 2 }; // NOTE(review): halved presumably so that (b - a) stays representable, as std::uniform_real_distribution requires - confirm
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::F16:
+                    {
+                        FloatDistributionType distribution{ FloatType(-65504.0f), FloatType(65504.0f) };
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::S32:
+                    {
+                        std::uniform_int_distribution<int32_t> distribution(std::numeric_limits<int32_t>::lowest(), std::numeric_limits<int32_t>::max());
+                        library->fill(tensor, distribution, i);
+                        break;
+                    }
+
+                    case DataType::QASYMM8:
+                    case DataType::QASYMM8_SIGNED:
+                        library->fill_tensor_uniform(tensor, i);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Not implemented");
+                }
+
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Not implemented");
+        }
+    }
+
+    TensorType compute_target(const TensorShape &shape, DataType data_type, bool in_place, QuantizationInfo qinfo, QuantizationInfo qinfo_out) // configures and runs FunctionType, returning the tensor that holds the result
+    {
+        // Create tensors
+        TensorType  src        = create_tensor<TensorType>(shape, data_type, 1, qinfo);
+        TensorType  dst        = create_tensor<TensorType>(shape, data_type, 1, qinfo_out);
+        TensorType *actual_dst = in_place ? &src : &dst; // in-place runs write the result back into src
+
+        // If _use_dynamic_shape is true, this fixture exercises the dynamic-shape scenario:
+        // - Before configure, the input tensor is marked as dynamic using set_tensor_dynamic()
+        // - After configure, it is marked as static again using set_tensor_static()
+        // - The tensors with static shape are then given to run()
+        if(_use_dynamic_shape)
+        {
+            set_tensor_dynamic(src);
+        }
+
+        // Create and configure function
+        FunctionType elwiseunary_layer;
+        elwiseunary_layer.configure(&src, actual_dst);
+
+        if(_use_dynamic_shape)
+        {
+            set_tensor_static(src);
+        }
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        src.allocator()->allocate();
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        if(!in_place) // dst is allocated only when it is the actual output
+        {
+            ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+            dst.allocator()->allocate();
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        }
+
+        // Fill tensors
+        fill(AccessorType(src), 0, data_type);
+
+        // Compute function
+        elwiseunary_layer.run();
+
+        if(in_place)
+        {
+            return src;
+        }
+        else
+        {
+            return dst;
+        }
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, QuantizationInfo qinfo, QuantizationInfo qinfo_out) // computes the expected result with reference::elementwise_unary
+    {
+        // Create reference
+        SimpleTensor<T> src{ shape, data_type, 1, qinfo };
+        SimpleTensor<T> dst{ shape, data_type, 1, qinfo_out };
+
+        // Fill reference (same fill index 0 as compute_target uses for its input)
+        fill(src, 0, data_type);
+
+        return reference::elementwise_unary<T>(src, dst, _op);
+    }
+
+    TensorType       _target{};                   // result of compute_target()
+    SimpleTensor<T>  _reference{};                // result of compute_reference()
+    ElementWiseUnary _op{};                       // operation under test, set by setup()
+    bool             _use_dynamic_shape{ false }; // when true, compute_target() marks src dynamic around configure()
+    QuantizationInfo _input_qinfo{};              // NOTE(review): never read or written in this file
+    QuantizationInfo _output_qinfo{};             // NOTE(review): never read or written in this file
+};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RsqrtQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo qinfo, QuantizationInfo qinfo_out) // forwards to the base setup(): op = RSQRT, in_place = false, use_dynamic_shape = false, qinfo / qinfo_out as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::RSQRT, false, qinfo, qinfo_out);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RsqrtValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = RSQRT, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::RSQRT);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RsqrtDynamicShapeValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = RSQRT, in_place = false, use_dynamic_shape = true; quantization infos keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::RSQRT, true);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ExpValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = EXP, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::EXP);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ExpQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = EXP, in_place = false, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::EXP, false, iq, oq);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class NegValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = NEG, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::NEG);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class NegQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = NEG, in_place = false, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::NEG, false, iq, oq);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class NegValidationInPlaceFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, bool in_place) // forwards to the base setup(): op = NEG with the caller's in_place flag; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, in_place, ElementWiseUnary::NEG);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class NegQuantizedValidationInPlaceFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, bool in_place, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = NEG, caller's in_place flag, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, in_place, ElementWiseUnary::NEG, false, iq, oq);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class LogValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = LOG, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::LOG);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class LogQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = LOG, in_place = false, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::LOG, false, iq, oq);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class AbsValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = ABS, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ABS);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class AbsQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = ABS, in_place = false, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ABS, false, iq, oq);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class SinValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = SIN, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::SIN);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class SinQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = SIN, in_place = false, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::SIN, false, iq, oq);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RoundValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type) // forwards to the base setup(): op = ROUND, in_place = false; remaining base parameters keep their defaults
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ROUND);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RoundQuantizedValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape, DataType data_type, QuantizationInfo iq, QuantizationInfo oq) // forwards to the base setup(): op = ROUND, in_place = false, use_dynamic_shape = false, iq / oq as input / output quantization
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ROUND, false, iq, oq);
+    }
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_ELEMENTWISE_UNARY_FIXTURE */
diff --git a/tests/validation/fixtures/FFTFixture.h b/tests/validation/fixtures/FFTFixture.h
index 86a97272a0..024227b22a 100644
--- a/tests/validation/fixtures/FFTFixture.h
+++ b/tests/validation/fixtures/FFTFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class FFTValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -88,15 +87,17 @@ protected:
         FunctionType fft;
         fft.configure(&src, &dst, InfoType());
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &dst });
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -132,18 +133,32 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class FFTConvolutionValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
-               DataType data_type, DataLayout data_layout, ActivationLayerInfo act_info)
+               DataType data_type, DataLayout data_layout, ActivationLayerInfo act_info, bool mixed_layout = false)
     {
-        _data_type   = data_type;
-        _data_layout = data_layout;
+        _mixed_layout = mixed_layout;
+        _data_type    = data_type;
+        _data_layout  = data_layout;
 
         _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info);
         _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info);
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(_data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Convolution function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(_data_layout);
+        dst.info()->set_data_layout(_data_layout);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i)
     {
@@ -185,14 +200,16 @@ protected:
         TensorType bias    = create_tensor<TensorType>(bias_shape, _data_type, 1, QuantizationInfo(), _data_layout);
         TensorType dst     = create_tensor<TensorType>(output_shape, _data_type, 1, QuantizationInfo(), _data_layout);
 
+        add_padding_x({ &src, &weights, &bias, &dst }, _data_layout);
+
         // Create and configure function
         FunctionType conv;
         conv.configure(&src, &weights, &bias, &dst, info, act_info, _data_type == DataType::F16);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -200,19 +217,25 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
         fill(AccessorType(weights), 1);
         fill(AccessorType(bias), 2);
 
-        // Compute convolution function
-        conv.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(conv, src, dst);
+        }
+        else
+        {
+            // Compute Convolution function
+            conv.run();
+        }
         return dst;
     }
 
@@ -239,18 +262,18 @@ protected:
     SimpleTensor<T> _reference{};
     DataType        _data_type{};
     DataLayout      _data_layout{};
+    bool            _mixed_layout{ false };
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class FFTConvolutionValidationFixture : public FFTConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
                DataType data_type, DataLayout data_layout, ActivationLayerInfo act_info)
     {
         FFTConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation,
-                                                                                                 data_type, data_layout, act_info);
+                                                                                                 data_type, data_layout, act_info, mixed_layout);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/FillFixture.h b/tests/validation/fixtures/FillFixture.h
index 706c13565d..0239a68903 100644
--- a/tests/validation/fixtures/FillFixture.h
+++ b/tests/validation/fixtures/FillFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class FillFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, DataType data_type)
     {
         _target = compute_target(input_shape, data_type);
diff --git a/tests/validation/fixtures/FlattenLayerFixture.h b/tests/validation/fixtures/FlattenLayerFixture.h
index 67c4d2a2b1..e72487c7cf 100644
--- a/tests/validation/fixtures/FlattenLayerFixture.h
+++ b/tests/validation/fixtures/FlattenLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class FlattenLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         TensorShape shape_flatten;
@@ -83,15 +82,15 @@ protected:
         FunctionType flatten_layer;
         flatten_layer.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/FloorFixture.h b/tests/validation/fixtures/FloorFixture.h
index 9388486983..7d38666f47 100644
--- a/tests/validation/fixtures/FloorFixture.h
+++ b/tests/validation/fixtures/FloorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class FloorValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -68,15 +67,15 @@ protected:
         FunctionType floor_func;
         floor_func.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/FullyConnectedLayerFixture.h b/tests/validation/fixtures/FullyConnectedLayerFixture.h
index 3760cfb8b7..344187868f 100644
--- a/tests/validation/fixtures/FullyConnectedLayerFixture.h
+++ b/tests/validation/fixtures/FullyConnectedLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_FULLY_CONNECTED_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_FULLY_CONNECTED_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -34,6 +34,7 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
+#include "tests/validation/Validation.h"
 #include "tests/validation/reference/ActivationLayer.h"
 #include "tests/validation/reference/FullyConnectedLayer.h"
 #include "tests/validation/reference/Utils.h"
@@ -54,16 +55,63 @@ public:
     using TBias  = typename std::conditional < (std::is_same<TDecay, uint8_t>::value || std::is_same<TDecay, int8_t>::value), int32_t, T >::type;
 
 public:
-    template <typename...>
+    void setup_quantization(TensorShape weights_shape, TensorShape output_shape, QuantizationInfo &input_q_info, QuantizationInfo &weights_q_info, DataType data_type)
+    {
+        _hash = weights_shape[0] + weights_shape[1] + output_shape[0] + output_shape[1];
+        const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+        const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+
+        std::mt19937                           generator(library->seed() + _hash);
+        std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        input_q_info = QuantizationInfo(scale_lhs, offset_lhs);
+        weights_q_info = QuantizationInfo(scale_rhs, offset_rhs);
+
+
+        const int k = weights_shape.x();
+        QuantizationHint q_hint = suggest_mac_dst_q_info_and_bias(input_q_info, weights_q_info, k, data_type, 0.1f /* bias_fraction */, 4 /* number of standard deviations*/);
+
+        _dst_q_info = q_hint.q_info;
+        _min_bias = q_hint.bias_min;
+        _max_bias = q_hint.bias_max;
+
+        // Do not change here as these limits are the natural limits of the associated data types and
+        // are embedded in the computation of the dst quantization info.
+        _min_u8 = 0;
+        _max_u8 = 255;
+        _min_s8 = -128;
+        _max_s8 = 127;
+    }
+
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, bool transpose_weights, bool reshape_weights,
-               DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo activation_info)
+               DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo activation_info, bool mixed_layout = false)
     {
         ARM_COMPUTE_UNUSED(weights_shape);
         ARM_COMPUTE_UNUSED(bias_shape);
 
+        _mixed_layout      = mixed_layout;
         _data_type         = data_type;
         _bias_data_type    = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
-        _quantization_info = quantization_info;
+
+        // Note : The QuantizationInfo parameter of setup() is ignored when the data type is quantized and the activation function is disabled or is identity; it is generated dynamically instead.
+        if(is_data_type_quantized(data_type) && (!activation_info.enabled() || activation_info.activation() == ActivationFunction::IDENTITY))
+        {
+            // Initialises quantization info with appropriate scale and offset for given input shapes.
+            setup_quantization(weights_shape, output_shape,_input_q_info, _weight_q_info, data_type);
+        }
+        else
+        {
+            _input_q_info = quantization_info;
+            _weight_q_info = quantization_info;
+            _dst_q_info = quantization_info;
+        }
+
         _activation_info   = activation_info;
 
         _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, transpose_weights, reshape_weights);
@@ -71,22 +119,37 @@ public:
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        const DataLayout data_layout = src.info()->data_layout();
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Fully Connected function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout);
+        dst.info()->set_data_layout(data_layout);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i)
     {
         if(_data_type == DataType::QASYMM8)
         {
-            std::uniform_int_distribution<uint8_t> distribution(0, 30);
+            std::uniform_int_distribution<uint32_t> distribution(_min_u8, _max_u8);
             library->fill(tensor, distribution, i);
         }
         else if(_data_type == DataType::QASYMM8_SIGNED)
         {
-            std::uniform_int_distribution<int8_t> distribution(-15, 15);
+            std::uniform_int_distribution<int32_t> distribution(_min_s8, _max_s8);
             library->fill(tensor, distribution, i);
         }
         else if(_data_type == DataType::S32)
         {
-            std::uniform_int_distribution<int32_t> distribution(-50, 50);
+            std::uniform_int_distribution<int32_t> distribution(_min_bias, _max_bias);
             library->fill(tensor, distribution, i);
         }
         else if(_data_type == DataType::F16)
@@ -128,10 +191,10 @@ protected:
         }
 
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, _data_type, 1, _quantization_info);
-        TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _data_type, 1, _quantization_info);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, _data_type, 1, _quantization_info);
+        TensorType src     = create_tensor<TensorType>(input_shape, _data_type, 1, _input_q_info);
+        TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _data_type, 1, _weight_q_info);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1);
+        TensorType dst     = create_tensor<TensorType>(output_shape, _data_type, 1, _dst_q_info);
 
         // Create Fully Connected layer info
         FullyConnectedLayerInfo fc_info;
@@ -143,10 +206,12 @@ protected:
         FunctionType fc;
         fc.configure(&src, &weights, &bias, &dst, fc_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &weights, &bias, &dst });
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -154,14 +219,14 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
-        fill(AccessorType(src), 0);
-        fill(AccessorType(bias), 2);
+        fill(AccessorType(src), 0 + _hash);
+        fill(AccessorType(bias), 2 + _hash);
 
         if(!reshape_weights || !transpose_weights)
         {
@@ -169,7 +234,7 @@ protected:
             RawTensor   tmp(tmp_shape, _data_type, 1);
 
             // Fill with original shape
-            fill(tmp, 1);
+            fill(tmp, 1 + _hash);
 
             // Transpose elementwise
             tmp = transpose(tmp);
@@ -186,11 +251,18 @@ protected:
         }
         else
         {
-            fill(AccessorType(weights), 1);
+            fill(AccessorType(weights), 1 + _hash);
         }
 
-        // Compute NEFullyConnectedLayer function
-        fc.run();
+        if(_mixed_layout)
+        {
+            mix_layout(fc, src, dst);
+        }
+        else
+        {
+            // Compute NEFullyConnectedLayer function
+            fc.run();
+        }
 
         return dst;
     }
@@ -198,54 +270,384 @@ protected:
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape)
     {
         // Create reference
-        SimpleTensor<T>     src{ input_shape, _data_type, 1, _quantization_info };
-        SimpleTensor<T>     weights{ weights_shape, _data_type, 1, _quantization_info };
-        SimpleTensor<TBias> bias{ bias_shape, _bias_data_type, 1, _quantization_info };
+        SimpleTensor<T>     src{ input_shape, _data_type, 1, _input_q_info };
+        SimpleTensor<T>     weights{ weights_shape, _data_type, 1, _weight_q_info };
+        SimpleTensor<TBias> bias{ bias_shape, _bias_data_type, 1, QuantizationInfo() };
 
         // Fill reference
-        fill(src, 0);
-        fill(weights, 1);
-        fill(bias, 2);
+        fill(src, 0 + _hash);
+        fill(weights, 1 + _hash);
+        fill(bias, 2 + _hash);
 
-        return reference::activation_layer(reference::fully_connected_layer<T>(src, weights, bias, output_shape), _activation_info, _quantization_info);
+        return reference::activation_layer(reference::fully_connected_layer<T>(src, weights, bias, output_shape, _dst_q_info), _activation_info, _dst_q_info);
     }
 
     TensorType          _target{};
     SimpleTensor<T>     _reference{};
     DataType            _data_type{};
     DataType            _bias_data_type{};
-    QuantizationInfo    _quantization_info{};
+    bool                _mixed_layout{ false };
+    QuantizationInfo    _input_q_info{};
+    QuantizationInfo    _weight_q_info{};
+    QuantizationInfo    _dst_q_info{};
     ActivationLayerInfo _activation_info{};
+
+    // Random initialization limits
+    // Default values are previously handcrafted limits
+    // that should be used when we don't use dynamic quantization
+    int32_t _min_bias{-50};
+    int32_t _max_bias{50};
+
+    int32_t _min_u8{0};
+    int32_t _max_u8{30};
+    int32_t _min_s8{-15};
+    int32_t _max_s8{15};
+    int    _hash{0};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class FullyConnectedLayerValidationFixture : public FullyConnectedLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, bool transpose_weights, bool reshape_weights, DataType data_type,
                ActivationLayerInfo activation_info)
     {
         FullyConnectedLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, transpose_weights,
                                                                                                       reshape_weights, data_type,
-                                                                                                      QuantizationInfo(), activation_info);
+                                                                                                      QuantizationInfo(), activation_info, mixed_layout);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class FullyConnectedLayerValidationQuantizedFixture : public FullyConnectedLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, bool transpose_weights, bool reshape_weights, DataType data_type,
                QuantizationInfo quantization_info, ActivationLayerInfo activation_info)
     {
         FullyConnectedLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, transpose_weights,
                                                                                                       reshape_weights, data_type,
-                                                                                                      quantization_info, activation_info);
+                                                                                                      quantization_info, activation_info, mixed_layout);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class FullyConnectedWithDynamicTensorsFixture : public framework::Fixture
+{
+private:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        if(_data_type == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, i);
+        }
+        else if(_data_type == DataType::F32)
+        {
+            std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, i);
+        }
+        else if(_data_type == DataType::QASYMM8)
+        {
+            std::uniform_int_distribution<uint32_t> distribution(_min_u8, _max_u8);
+            library->fill(tensor, distribution, i);
+        }
+        else if(_data_type == DataType::QASYMM8_SIGNED)
+        {
+            std::uniform_int_distribution<int32_t> distribution(_min_s8, _max_s8);
+            library->fill(tensor, distribution, i);
+        }
+        else if(_data_type == DataType::S32)
+        {
+            std::uniform_int_distribution<int32_t> distribution(_min_bias, _max_bias);
+            library->fill(tensor, distribution, i);
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    void fill_transposed_weights(TensorType &weights, TensorShape weights_shape, int seed)
+    {
+        RawTensor tmp(weights_shape, _data_type, 1);
+
+        // Fill with original shape
+        fill(tmp, seed);
+
+        // Transpose elementwise
+        tmp = transpose(tmp);
+
+        AccessorType weights_accessor(weights);
+
+        for(int i = 0; i < tmp.num_elements(); ++i)
+        {
+            Coordinates coord = index2coord(tmp.shape(), i);
+            std::copy_n(static_cast<const RawTensor::value_type *>(tmp(coord)),
+                        tmp.element_size(),
+                        static_cast<RawTensor::value_type *>(weights_accessor(coord)));
+        }
+    }
+
+    void validate_with_tolerance(TensorType &target, SimpleTensor<float> &ref)
+    {
+        constexpr RelativeTolerance<float> rel_tolerance_f32(0.01f);
+        constexpr AbsoluteTolerance<float> abs_tolerance_f32(0.001f);
+        validate(AccessorType(target), ref, rel_tolerance_f32, 0, abs_tolerance_f32);
+    }
+
+    void validate_with_tolerance(TensorType &target, SimpleTensor<half_float::half> &ref)
+    {
+        constexpr AbsoluteTolerance<float>        abs_tolerance_f16(0.3f);
+        const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f));
+        constexpr float                           tolerance_num_f16 = 0.07f;
+
+        validate(AccessorType(target), ref, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
+    }
+
+    void validate_with_tolerance(TensorType &target, SimpleTensor<uint8_t> &ref)
+    {
+        constexpr AbsoluteTolerance<uint32_t> tolerance_qasymm8(1);
+        validate(AccessorType(target), ref, tolerance_qasymm8);
+    }
+
+    void validate_with_tolerance(TensorType &target, SimpleTensor<int8_t> &ref)
+    {
+        constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8_signed(1);
+        validate(AccessorType(target), ref, tolerance_qasymm8_signed);
+    }
+
+    void setup_quantization(TensorShape weights_shape, TensorShape output_shape, QuantizationInfo &input_q_info, QuantizationInfo &weights_q_info, DataType data_type)
+    {
+        _hash = weights_shape[0] + weights_shape[1] + output_shape[0] + output_shape[1];
+
+        const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+        const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+
+        std::mt19937                           generator(library->seed() + _hash);
+        std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        input_q_info = QuantizationInfo(scale_lhs, offset_lhs);
+        weights_q_info = QuantizationInfo(scale_rhs, offset_rhs);
+
+        const int k = weights_shape.x();
+        QuantizationHint q_hint = suggest_mac_dst_q_info_and_bias(input_q_info, weights_q_info, k, data_type, 0.1f /* bias_fraction */, 4 /* number of standard deviations*/);
+
+        _dst_q_info = q_hint.q_info;
+        _min_bias = q_hint.bias_min;
+        _max_bias = q_hint.bias_max;
+
+        // Do not change here as these limits are the natural limits of the associated data types and
+        // are embedded in the computation of the dst quantization info.
+        _min_u8 = 0;
+        _max_u8 = 255;
+        _min_s8 = -128;
+        _max_s8 = 127;
+    }
+
+public:
+    using TDecay = typename std::decay<T>::type;
+    using TBias  = typename std::conditional < (std::is_same<TDecay, uint8_t>::value || std::is_same<TDecay, int8_t>::value), int32_t, T >::type;
+
+    void setup(TensorShape src_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape dst_shape,
+               DataType data_type, ActivationLayerInfo activation_info, bool constant_weights, bool constant_bias, bool weights_reshaped, bool remove_bias = false)
+    {
+        _data_type = data_type;
+
+        const bool     is_quantized   = is_data_type_quantized(data_type);
+        const DataType bias_data_type = (is_quantized) ? DataType::S32 : data_type;
+
+        if (is_quantized && (!activation_info.enabled() || activation_info.activation() == ActivationFunction::IDENTITY))
+        {
+            setup_quantization(weights_shape, dst_shape, _src_q_info, _weights_q_info, data_type);
+        }
+        else
+        {
+            _src_q_info = QuantizationInfo(0.1f, 10);
+            _dst_q_info = QuantizationInfo(0.3f, 20);
+            _weights_q_info = QuantizationInfo(0.2f, 5);
+        }
+
+        // Configure TensorInfo Objects
+        const TensorInfo src_info(src_shape, 1, data_type, _src_q_info);
+        const TensorInfo dst_info(dst_shape, 1, data_type, _dst_q_info);
+        TensorInfo       bias_info(bias_shape, 1, bias_data_type);
+        TensorInfo       wei_info(weights_shape, 1, data_type, _weights_q_info);
+
+        if(!constant_weights && weights_reshaped)
+        {
+            const TensorShape tr_weights_shape{ weights_shape[1], weights_shape[0] };
+            wei_info.set_tensor_shape(tr_weights_shape);
+        }
+        wei_info.set_are_values_constant(constant_weights);
+        bias_info.set_are_values_constant(constant_bias);
+
+        // Initialise Tensors
+        _src.allocator()->init(src_info);
+        _weights.allocator()->init(wei_info);
+        if(!remove_bias)
+            _bias.allocator()->init(bias_info);
+        _dst.allocator()->init(dst_info);
+
+        // Configure FC layer and mark the weights as non constant
+        FullyConnectedLayerInfo fc_info;
+        fc_info.activation_info = activation_info;
+        if(!constant_weights)
+        {
+            fc_info.are_weights_reshaped = weights_reshaped;
+            fc_info.transpose_weights    = !weights_reshaped;
+        }
+        FunctionType fc;
+        fc.configure(&_src, &_weights, (remove_bias) ? nullptr : &_bias, &_dst, fc_info);
+
+        // Allocate all the tensors
+        _src.allocator()->allocate();
+        _weights.allocator()->allocate();
+        if(!remove_bias)
+            _bias.allocator()->allocate();
+        _dst.allocator()->allocate();
+
+        // Run multiple iterations with different inputs
+        constexpr int num_iterations    = 5;
+        int           randomizer_offset = 0;
+
+        // Create reference tensors
+        SimpleTensor<T>     src{ src_shape, data_type, 1, _src_q_info };
+        SimpleTensor<T>     weights{ weights_shape, data_type, 1, _weights_q_info };
+        SimpleTensor<TBias> bias{ bias_shape, bias_data_type };
+
+        // Fill weights and/or bias if they remain constant
+        if(constant_weights)
+        {
+            fill(AccessorType(_weights), 1 + _hash);
+            fill(weights, 1 + _hash);
+        }
+        if(constant_bias && !remove_bias)
+        {
+            fill(AccessorType(_bias), 2 + _hash);
+            fill(bias, 2 + _hash);
+        }
+        // To remove bias, fill with 0
+        if(remove_bias && is_quantized)
+        {
+            library->fill_tensor_value(bias, 0);
+        }
+        else if(remove_bias)
+        {
+            library->fill_tensor_value(bias, (float)0.0);
+        }
+
+        for(int i = 0; i < num_iterations; ++i)
+        {
+            // Run target
+            {
+                fill(AccessorType(_src), randomizer_offset);
+                if(!constant_weights)
+                {
+                    if(weights_reshaped)
+                    {
+                        fill_transposed_weights(_weights, weights_shape, randomizer_offset + 1 + _hash);
+                    }
+                    else
+                    {
+                        fill(AccessorType(_weights), randomizer_offset + 1 +_hash);
+                    }
+                }
+                if(!constant_bias && !remove_bias)
+                {
+                    fill(AccessorType(_bias), randomizer_offset + 2 + _hash);
+                }
+
+                fc.run();
+            }
+
+            // Run reference and compare
+            {
+                // Fill reference
+                fill(src, randomizer_offset);
+                if(!constant_weights)
+                {
+                    fill(weights, randomizer_offset + 1 + _hash);
+                }
+                if(!constant_bias && !remove_bias)
+                {
+                    fill(bias, randomizer_offset + 2 + _hash);
+                }
+
+                auto dst = reference::activation_layer(reference::fully_connected_layer<T>(src, weights, bias, dst_shape, _dst_q_info), activation_info, _dst_q_info);
+
+                // Validate
+                validate_with_tolerance(_dst, dst);
+            }
+
+            randomizer_offset += 100;
+        }
+    }
+
+private:
+    TensorType _src{}, _weights{}, _bias{}, _dst{};
+    DataType   _data_type{ DataType::UNKNOWN };
+
+    QuantizationInfo _src_q_info{};
+    QuantizationInfo _weights_q_info{};
+    QuantizationInfo _dst_q_info{};
+
+    // Random initialization limits
+    // Default values are previously handcrafted limits
+    // that should be used when we don't use dynamic quantization
+    int32_t _min_bias{-50};
+    int32_t _max_bias{50};
+
+    int32_t _min_u8{0};
+    int32_t _max_u8{30};
+    int32_t _min_s8{-15};
+    int32_t _max_s8{15};
+    int     _hash{0};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class FullyConnectedWithDynamicWeightsFixture : public FullyConnectedWithDynamicTensorsFixture<TensorType, AccessorType, FunctionType, T>
+{ // Thin wrapper: forwards to the base fixture's setup() with fixed boolean flags; only weights_reshaped is caller-controlled.
+public:
+    void setup(TensorShape src_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape dst_shape,
+               DataType data_type, ActivationLayerInfo activation_info, bool weights_reshaped)
+    {
+        FullyConnectedWithDynamicTensorsFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, weights_shape, bias_shape,
+                                                                                                  dst_shape, data_type, activation_info, false, true, weights_reshaped, false); // NOTE(review): trailing flags presumably are (constant_weights, constant_bias, weights_reshaped, remove_bias) - confirm against the base setup() signature
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class FullyConnectedDynamicNoBiasFixture : public FullyConnectedWithDynamicTensorsFixture<TensorType, AccessorType, FunctionType, T>
+{ // Thin wrapper: same forwarding as the dynamic-weights variant except that the last boolean flag is true.
+public:
+    void setup(TensorShape src_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape dst_shape,
+               DataType data_type, ActivationLayerInfo activation_info, bool weights_reshaped)
+    {
+        FullyConnectedWithDynamicTensorsFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, weights_shape, bias_shape,
+                                                                                                  dst_shape, data_type, activation_info, false, true, weights_reshaped, true); // NOTE(review): last flag presumably is remove_bias (bias_shape is still forwarded) - confirm against the base setup() signature
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class FullyConnectedWithDynamicBiasFixture : public FullyConnectedWithDynamicTensorsFixture<TensorType, AccessorType, FunctionType, T>
+{ // Thin wrapper: forwards to the base fixture's setup() with all four boolean flags fixed (no weights_reshaped parameter is exposed).
+public:
+    void setup(TensorShape src_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape dst_shape,
+               DataType data_type, ActivationLayerInfo activation_info)
+    {
+        FullyConnectedWithDynamicTensorsFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, weights_shape, bias_shape,
+                                                                                                  dst_shape, data_type, activation_info, true, false, false, false); // NOTE(review): flags presumably are (constant_weights, constant_bias, weights_reshaped, remove_bias) - confirm against the base setup() signature
     }
 };
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_FULLY_CONNECTED_LAYER_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_FULLYCONNECTEDLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/FuseBatchNormalizationFixture.h b/tests/validation/fixtures/FuseBatchNormalizationFixture.h
index 552dc7c360..a05e4169a7 100644
--- a/tests/validation/fixtures/FuseBatchNormalizationFixture.h
+++ b/tests/validation/fixtures/FuseBatchNormalizationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, int
 class FuseBatchNormalizationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape_w, DataType data_type, DataLayout data_layout, bool in_place, bool with_bias, bool with_gamma, bool with_beta)
     {
         std::tie(_target_w, _target_b)       = compute_target(shape_w, data_type, data_layout, in_place, with_bias, with_gamma, with_beta);
@@ -96,14 +95,14 @@ protected:
         FunctionType fuse_batch_normalization;
         fuse_batch_normalization.configure(&w, &mean, &var, w_fused_to_use, b_fused_to_use, b_to_use, beta_to_use, gamma_to_use, _epsilon, fuse_bn_type);
 
-        ARM_COMPUTE_EXPECT(w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(var.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(w_fused.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b_fused.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(beta.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(gamma.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(var.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(w_fused.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b_fused.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(beta.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(gamma.info()->is_resizable());
 
         // Allocate tensors
         w.allocator()->allocate();
@@ -115,14 +114,14 @@ protected:
         beta.allocator()->allocate();
         gamma.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!var.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!w_fused.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b_fused.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!beta.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!gamma.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!var.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!w_fused.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b_fused.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!beta.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!gamma.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(w), 0U, -1.0f, 1.0f);
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index 500e094e18..94bedc83e1 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMM_FIXTURE
-#define ARM_COMPUTE_TEST_GEMM_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
 
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorShape.h"
@@ -34,6 +34,7 @@
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
 #include "tests/validation/reference/GEMM.h"
 
 #include <random>
@@ -44,16 +45,15 @@ namespace test
 {
 namespace validation
 {
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false>
-class GEMMValidationFixture : public framework::Fixture
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool pretranspose_a = false, bool pretranspose_b = false, bool run_twice = false>
+class GEMMGenericValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type, bool accumulate=false)
     {
         ARM_COMPUTE_UNUSED(pretranspose);
-        _target    = compute_target(shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type);
-        _reference = compute_reference(shape_a, shape_b, output_shape, alpha, beta, data_type);
+        _target    = compute_target(shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate);
+        _reference = compute_reference(shape_a, shape_b, output_shape, alpha, beta, data_type, accumulate);
     }
 
 protected:
@@ -80,7 +80,7 @@ protected:
     }
 
     TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, const TensorShape &output_shape, float alpha, float beta,
-                              DataType data_type)
+                              DataType data_type, bool accumulate=false)
     {
         // Create tensors
         TensorType a   = create_tensor<TensorType>(shape_a, data_type, 1);
@@ -98,12 +98,14 @@ protected:
                        (disable_c) ? nullptr : &c,
                        &dst,
                        alpha, beta,
-                       GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, (reinterpret_input_as_3d
-                                || reinterpret_output_as_3d)));
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+                       GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, false, (reinterpret_input_as_3d
+                                || reinterpret_output_as_3d), arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, false /* pretranspose_B */, accumulate));
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &a, &b, &c, &dst });
 
         // Allocate tensors
         a.allocator()->allocate();
@@ -111,18 +113,33 @@ protected:
         c.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(a), 0);
         fill(AccessorType(b), 1);
+        if (accumulate)
+        {
+            fill(AccessorType(dst), 6);
+        }
         if(!disable_c)
         {
             fill(AccessorType(c), 2);
         }
+        // Run with variable inputs.
+        if(run_twice)
+        {
+            gemm.run();
+            fill(AccessorType(a), 3); // Fill tensors with new seed after run
+            fill(AccessorType(b), 4);
+            if(!disable_c)
+            {
+                fill(AccessorType(c), 5);
+            }
+        }
 
         // Compute GEMM function
         gemm.run();
@@ -131,10 +148,9 @@ protected:
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, float alpha, float beta,
-                                      DataType data_type)
+                                      DataType data_type, bool accumulate=false)
     {
         TensorShape shape_a_to_use = shape_a;
-
         if(reinterpret_input_as_3d)
         {
             // Collapse the second and third dimension if the input is 3D
@@ -145,6 +161,7 @@ protected:
         SimpleTensor<T> a{ shape_a_to_use, data_type, 1 };
         SimpleTensor<T> b{ shape_b, data_type, 1 };
         SimpleTensor<T> c{ output_shape, data_type, 1 };
+        SimpleTensor<T> dst{ output_shape, data_type, 1 };
 
         // Fill reference
         fill(a, 0);
@@ -157,27 +174,96 @@ protected:
             const int m          = reinterpret_output_as_3d ? output_shape[1] * output_shape[2] : output_shape[1];
             const int batch_size = reinterpret_output_as_3d ? output_shape[3] : output_shape[2];
 
-            // In case of broadcast, we need simply copy the first into the following "M" ones
+            // In case of broadcast, we need to simply copy the first into the following "M" ones
             for(int i = 1; i < m * batch_size; i++)
             {
                 memcpy(c.data() + i * n, c.data(), n * sizeof(T));
             }
         }
 
+        /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
+           therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K)
+           in order to be able to call reference implementation that works with (B x M x K) input.
+           Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), B must be pre-transposed before passing it to the fixture. */
+
+        // Define transposed shapes
+        TensorShape a_transposed_shape(a.shape().y(), a.shape().x());
+        TensorShape b_transposed_shape(b.shape().y(), b.shape().x());
+
+        // Define transposed tensors
+        SimpleTensor<T> a_transposed{ a_transposed_shape, data_type };
+        SimpleTensor<T> b_transposed{ b_transposed_shape, data_type };
+
+        // pretranspose a if necessary
+        if(pretranspose_a)
+        {
+            transpose_matrix<T>(a, a_transposed);
+        }
+
+        // pretranspose b if necessary
+        if(pretranspose_b)
+        {
+            transpose_matrix<T>(b, b_transposed);
+        }
+
+        // Run with variable inputs.
+        if(run_twice)
+        {
+            reference::gemm<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta);
+            fill((pretranspose_a) ? a_transposed : a, 3);
+            fill((pretranspose_b) ? b_transposed : b, 4);
+            fill(c, 5);
+        }
+
+        // Do in place summation
+        if (accumulate)
+        {
+            fill(dst, 6);
+        }
+
         // Setting beta to 0 will effectively disable C for the
         // computation of the reference: alpha * A * B + 0 * C
-        return reference::gemm<T>(a, b, c, alpha, disable_c ? 0.f : beta);
+        // Use transposed tensors if boolean enabled else use original tensors
+        if (accumulate)
+        {
+            reference::gemm_accumulate<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta, dst);
+            return dst;
+        }
+        else
+        {
+            return reference::gemm<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta);
+        }
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool pretranspose_a = false, bool pretranspose_b = false, bool run_twice = false>
+class GEMMValidationFixture : protected GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice>
+{ // Non-accumulating variant of the generic GEMM fixture: keeps the same setup() parameter list minus the accumulate flag.
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type)
+    { // All arguments are forwarded unchanged; accumulate is pinned to false.
+        GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice>::setup(shape_a, shape_b, shape_c, output_shape, alpha, beta, pretranspose, data_type, false /*accumulate*/);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool pretranspose_a = false, bool pretranspose_b = false, bool run_twice = false>
+class GEMMAccumulateValidationFixture : protected GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice>
+{ // Accumulating variant of the generic GEMM fixture: same setup() parameter list as GEMMValidationFixture.
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type)
+    {
+        bool accumulate = true; // Pinned to true: the generic fixture then pre-fills dst and uses the accumulate reference path.
+        GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice>::setup(shape_a, shape_b, shape_c, output_shape, alpha, beta, pretranspose, data_type, accumulate);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
 class GEMMMatrixMultiplyValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, float alpha, float beta, bool broadcast_bias, bool fp16_mixed_precision, const ActivationLayerInfo &act_info,
                DataType data_type, GPUTarget gpu_arch)
     {
@@ -224,12 +310,14 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        GEMMFunctionType gemm;
-        gemm.configure(gpu_arch, &lhs, &rhs, &bias, &dst, alpha, beta, false, reshape_info, fp16_mixed_precision, act_info);
+        GEMMOperatorType gemm;
+        gemm.configure(gpu_arch, lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, false, reshape_info, fp16_mixed_precision, act_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &bias, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -237,10 +325,10 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -248,7 +336,11 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -276,7 +368,7 @@ protected:
 
         if(broadcast_bias)
         {
-            // In case of broadcast, we need simply copy the first into the following "M" ones
+            // In case of broadcast, we need to simply copy the first into the following "M" ones
             for(int i = 1; i < m * batch_size; i++)
             {
                 memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -290,11 +382,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
 class GEMMMatrixMultiply3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, float alpha, float beta, bool broadcast_bias, bool fp16_mixed_precision,
                const ActivationLayerInfo &act_info, DataType data_type, GPUTarget gpu_arch)
     {
@@ -340,12 +431,14 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        GEMMFunctionType gemm;
-        gemm.configure(gpu_arch, &lhs, &rhs, &bias, &dst, alpha, beta, false, reshape_info, fp16_mixed_precision, act_info);
+        GEMMOperatorType gemm;
+        gemm.configure(gpu_arch, lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, false, reshape_info, fp16_mixed_precision, act_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &bias, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -353,10 +446,10 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -364,7 +457,11 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -392,7 +489,7 @@ protected:
         fill(rhs, 1);
         fill(bias, 2);
 
-        // In case of broadcast, we need simply copy the first into the following "M" ones
+        // In case of broadcast, we need to simply copy the first into the following "M" ones
         for(int i = 1; i < m * batch_size; i++)
         {
             memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -405,11 +502,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSFunctionType, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
 class GEMMMatrixMultiplyInterleavedTransposedValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, float alpha, float beta, unsigned int v0, unsigned int h0, bool broadcast_bias, bool fp16_mixed_precision,
                const ActivationLayerInfo &act_info, DataType data_type, GPUTarget gpu_arch)
     {
@@ -472,16 +568,22 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeLHSFunctionType reshape_lhs;
-        ReshapeRHSFunctionType reshape_rhs;
-        GEMMFunctionType       gemm;
-        reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(gpu_arch, &lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, true, reshape_info, fp16_mixed_precision, act_info);
-
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(gpu_arch, lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, true, reshape_info, fp16_mixed_precision, act_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // We do not pad when using image as it needs to comply to strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
+        }
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -491,12 +593,12 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -504,9 +606,15 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        reshape_lhs.run();
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -534,7 +642,7 @@ protected:
 
         if(broadcast_bias)
         {
-            // In case of broadcast, we need simply copy the first into the following "M" ones
+            // In case of broadcast, we need to simply copy the first into the following "M" ones
             for(int i = 1; i < m * batch_size; i++)
             {
                 memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -548,11 +656,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSFunctionType, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
 class GEMMMatrixMultiplyInterleavedTransposed3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, float alpha, float beta, unsigned int v0, unsigned int h0, bool broadcast_bias,
                bool fp16_mixed_precision, const ActivationLayerInfo &act_info, DataType data_type, GPUTarget gpu_arch)
     {
@@ -614,16 +721,22 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeLHSFunctionType reshape_lhs;
-        ReshapeRHSFunctionType reshape_rhs;
-        GEMMFunctionType       gemm;
-        reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(gpu_arch, &lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, true, reshape_info, fp16_mixed_precision, act_info);
-
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(gpu_arch, lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, true, reshape_info, fp16_mixed_precision, act_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // We do not pad when using image as it needs to comply with strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
+        }
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -633,12 +746,12 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -646,9 +759,15 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        reshape_lhs.run();
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -676,7 +795,7 @@ protected:
         fill(rhs, 1);
         fill(bias, 2);
 
-        // In case of broadcast, we need simply copy the first into the following "M" ones
+        // In case of broadcast, we need to simply copy the first into the following "M" ones
         for(int i = 1; i < m * batch_size; i++)
         {
             memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -689,11 +808,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSFunctionType, typename ReshapeRHSFunctionType, typename GEMMFunctionType, bool fp_mixed_precision = false>
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
 class GEMMMatrixMultiplyReshapedValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs,
                bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info)
     {
@@ -768,9 +886,9 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeLHSFunctionType reshape_lhs;
-        ReshapeRHSFunctionType reshape_rhs;
-        GEMMFunctionType       gemm;
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
 
         validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
         validate_result = validate_result || !rhs_info.export_to_cl_image;
@@ -779,13 +897,19 @@ protected:
             return nullptr;
         }
 
-        reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // We do not pad when using image as it needs to comply with strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
+        }
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -795,12 +919,12 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -808,9 +932,15 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        reshape_lhs.run();
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -838,7 +968,7 @@ protected:
 
         if(broadcast_bias)
         {
-            // In case of broadcast, we need simply copy the first into the following "M" ones
+            // In case of broadcast, we need to simply copy the first into the following "M" ones
             for(int i = 1; i < m * batch_size; i++)
             {
                 memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -860,11 +990,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSFunctionType, typename ReshapeRHSFunctionType, typename GEMMFunctionType, bool fp_mixed_precision = false>
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
 class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
                bool interleave_lhs, bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info)
     {
@@ -936,9 +1065,9 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeLHSFunctionType reshape_lhs;
-        ReshapeRHSFunctionType reshape_rhs;
-        GEMMFunctionType       gemm;
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
 
         validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
         validate_result = validate_result || !rhs_info.export_to_cl_image;
@@ -947,13 +1076,19 @@ protected:
             return nullptr;
         }
 
-        reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // We do not pad when using image as it needs to comply with strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
+        }
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -963,12 +1098,12 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -976,9 +1111,15 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        reshape_lhs.run();
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1006,7 +1147,7 @@ protected:
         fill(rhs, 1);
         fill(bias, 2);
 
-        // In case of broadcast, we need simply copy the first into the following "M" ones
+        // In case of broadcast, we need to simply copy the first into the following "M" ones
         for(int i = 1; i < m * batch_size; i++)
         {
             memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -1027,11 +1168,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
 class GEMMMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
                bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info)
     {
@@ -1101,8 +1241,8 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeRHSFunctionType reshape_rhs;
-        GEMMFunctionType       gemm;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
 
         validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
         validate_result = validate_result || !rhs_info.export_to_cl_image;
@@ -1111,12 +1251,18 @@ protected:
             return nullptr;
         }
 
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // We do not pad when using image as it needs to comply with strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst });
+        }
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1125,11 +1271,11 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -1137,8 +1283,13 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1166,7 +1317,7 @@ protected:
 
         if(broadcast_bias)
         {
-            // In case of broadcast, we need simply copy the first into the following "M" ones
+            // In case of broadcast, we need to simply copy the first into the following "M" ones
             for(int i = 1; i < m * batch_size; i++)
             {
                 memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -1181,11 +1332,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
 class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
                bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, bool has_pad_y, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info)
     {
@@ -1253,8 +1403,8 @@ protected:
 
         // The output tensor will be auto-initialized within the function
         // Create and configure function
-        ReshapeRHSFunctionType reshape_rhs;
-        GEMMFunctionType       gemm;
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
 
         validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
         validate_result = validate_result || !rhs_info.export_to_cl_image;
@@ -1263,8 +1413,8 @@ protected:
             return nullptr;
         }
 
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
 
         if(has_pad_y)
         {
@@ -1273,9 +1423,15 @@ protected:
             dst.info()->extend_padding(PaddingSize(2, 0, 1, 0));
         }
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // We do not pad when using image as it needs to comply with strict pitch alignment restrictions
+        if(!rhs_info.export_to_cl_image)
+        {
+            add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst });
+        }
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1284,11 +1440,11 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -1296,8 +1452,13 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1325,7 +1486,7 @@ protected:
         fill(rhs, 1);
         fill(bias, 2);
 
-        // In case of broadcast, we need simply copy the first into the following "M" ones
+        // In case of broadcast, we need to simply copy the first into the following "M" ones
         for(int i = 1; i < m * batch_size; i++)
         {
             memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -1339,11 +1500,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
 class GEMMMatrixMultiplyNativeValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta, bool broadcast_bias,
                const ActivationLayerInfo &act_info)
     {
@@ -1403,12 +1563,14 @@ protected:
         kernel_info.activation_info         = act_info;
 
         // Create and configure function
-        GEMMFunctionType gemm;
-        gemm.configure(&lhs, &rhs, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+        GEMMOperatorType gemm;
+        gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &bias, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1416,10 +1578,10 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -1427,7 +1589,11 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1455,7 +1621,7 @@ protected:
 
         if(broadcast_bias)
         {
-            // In case of broadcast, we need simply copy the first into the following "M" ones
+            // In case of broadcast, we need to simply copy the first into the following "M" ones
             for(int i = 1; i < m * batch_size; i++)
             {
                 memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -1469,11 +1635,10 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename T, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
 class GEMMMatrixMultiplyNative3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta,
                const ActivationLayerInfo &act_info)
     {
@@ -1532,12 +1697,14 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        GEMMFunctionType gemm;
-        gemm.configure(&lhs, &rhs, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+        GEMMOperatorType gemm;
+        gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        add_padding_x({ &lhs, &rhs, &bias, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1545,10 +1712,10 @@ protected:
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
@@ -1556,7 +1723,11 @@ protected:
         fill(AccessorType(bias), 2);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1584,7 +1755,7 @@ protected:
         fill(rhs, 1);
         fill(bias, 2);
 
-        // In case of broadcast, we need simply copy the first into the following "M" ones
+        // In case of broadcast, we need to simply copy the first into the following "M" ones
         for(int i = 1; i < m * batch_size; i++)
         {
             memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
@@ -1597,7 +1768,170 @@ protected:
     SimpleTensor<T> _reference{};
 };
 
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
+class GEMMMatrixMultiplyReshapedOnlyRhsMMULValidationFixture : public framework::Fixture // Reshapes the RHS, runs the GEMM operator on the target and builds the matching reference result
+{
+public:
+    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, bool export_to_cl_image, DataType data_type, float alpha,
+               float beta, bool broadcast_bias,
+               const ActivationLayerInfo &act_info)
+    {
+        GEMMLHSMatrixInfo lhs_info;
+        lhs_info.m0 = m0;
+        lhs_info.k0 = k0;
+
+        GEMMRHSMatrixInfo rhs_info;
+        rhs_info.n0                 = n0;
+        rhs_info.k0                 = k0;
+        rhs_info.interleave         = true;
+        rhs_info.transpose          = false;
+        rhs_info.h0                 = 4; // Fixed by this fixture; it is not one of the setup() parameters
+        rhs_info.export_to_cl_image = export_to_cl_image;
+
+        // Set the tensor shapes for the LHS, RHS and bias: the bias is (n, 1, 1) when broadcast, (n, m, batch_size) otherwise
+        const TensorShape lhs_shape(k, m, batch_size);
+        const TensorShape rhs_shape(n, k, batch_size);
+        const TensorShape bias_shape(n,
+                                     broadcast_bias ? 1 : m,
+                                     broadcast_bias ? 1 : batch_size);
+
+        _target    = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); // Must run first: it sets validate_result, which compute_reference() reads
+        _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type; // half needs the 16-bit distribution; every other T uses the standard one
+
+        DistributionType distribution{ T(-1.0f), T(1.0f) }; // Tensor values are drawn between -1 and 1
+        library->fill(tensor, distribution, i);
+
+        // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+        DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+        library->fill_borders_with_garbage(tensor, distribution_inf, i);
+    }
+
+    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+                              DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info)
+    {
+        // Create tensors (rhs_reshaped and dst are default-constructed; only their info() is handed to validate/configure below)
+        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1);
+        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1);
+        TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+        TensorType rhs_reshaped;
+        TensorType dst;
+
+        const unsigned int M = lhs_shape[1];
+        const unsigned int N = rhs_shape[0];
+        const unsigned int K = lhs_shape[0];
+        GEMMKernelInfo     kernel_info;
+        kernel_info.m                       = M;
+        kernel_info.n                       = N;
+        kernel_info.k                       = K;
+        kernel_info.depth_output_gemm3d     = 0;
+        kernel_info.reinterpret_input_as_3d = false;
+        kernel_info.broadcast_bias          = broadcast_bias;
+        kernel_info.activation_info         = act_info;
+
+        // Create, validate and configure the reshape and GEMM operators; return early as soon as one validate() fails
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMOperatorType       gemm;
+
+        validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); // Stored in a member so compute_reference() can skip its work as well
+        if(!validate_result)
+        {
+            return nullptr; // Placeholder result; relies on TensorType being constructible from nullptr
+        }
+
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+
+        validate_result = bool(gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info));
+        if(!validate_result)
+        {
+            return nullptr; // Placeholder result; relies on TensorType being constructible from nullptr
+        }
+
+        gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+        // Allocate tensors
+        lhs.allocator()->allocate();
+        rhs.allocator()->allocate();
+        rhs_reshaped.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors (fill indices 0, 1 and 2 match the ones used in compute_reference)
+        fill(AccessorType(lhs), 0);
+        fill(AccessorType(rhs), 1);
+        fill(AccessorType(bias), 2);
+
+        // Reshape the RHS first, then compute GEMM on the reshaped RHS
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+            { ACL_SRC_1, &rhs_reshaped },
+            { ACL_SRC_2, &bias },
+            { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
+                                      const ActivationLayerInfo &act_info)
+    {
+        if(!validate_result) // Set by compute_target(); when validation failed there, return an empty reference
+            return SimpleTensor<T>();
+
+        TensorShape dst_shape = lhs_shape;
+        dst_shape[0]          = rhs_shape[0];
+        dst_shape[1]          = lhs_shape[1];
+
+        // Create reference
+        SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
+        SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
+        SimpleTensor<T> bias{ dst_shape, data_type, 1 }; // The reference bias always has the full output shape, even when broadcast
+
+        const int n          = rhs_shape[0];
+        const int m          = lhs_shape[1];
+        const int batch_size = lhs_shape[2];
+
+        // Fill reference (same fill indices as the target tensors)
+        fill(lhs, 0);
+        fill(rhs, 1);
+        fill(bias, 2);
+
+        if(broadcast_bias)
+        {
+            // In case of broadcast, copy the first row of n values into each of the remaining m * batch_size - 1 rows
+            for(int i = 1; i < m * batch_size; i++)
+            {
+                memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
+            }
+        }
+
+        return reference::activation_layer(reference::gemm<T>(lhs, rhs, bias, alpha, beta), act_info);
+    }
+
+    bool            validate_result = true; // Outcome of the most recent validate() call made in compute_target()
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMM_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
diff --git a/tests/validation/fixtures/GEMMInterleave4x4Fixture.h b/tests/validation/fixtures/GEMMInterleave4x4Fixture.h
index 1ce0eafead..59fc460869 100644
--- a/tests/validation/fixtures/GEMMInterleave4x4Fixture.h
+++ b/tests/validation/fixtures/GEMMInterleave4x4Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class GEMMInterleave4x4ValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(size_t x, size_t y, DataType data_type)
     {
         _data_type = data_type;
@@ -88,24 +87,25 @@ protected:
 
         // Create and configure function
         FunctionType f;
-        f.configure(&a, &b);
+        f.configure(a.info(), b.info());
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         b.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(a), 0);
         fill(AccessorType(b), 0);
 
-        // Compute GEMM function
-        f.run();
+        // Compute GEMM interleave kernel
+        ITensorPack tensors{ { ACL_SRC, &a }, { ACL_DST, &b } };
+        f.run(tensors);
         return b;
     }
 
diff --git a/tests/validation/fixtures/GEMMLowpAssemblyFixture.h b/tests/validation/fixtures/GEMMLowpAssemblyFixture.h
deleted file mode 100644
index e9ec1bc365..0000000000
--- a/tests/validation/fixtures/GEMMLowpAssemblyFixture.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GEMMLOWP_ASSEMBLY_FIXTURE
-#define ARM_COMPUTE_TEST_GEMMLOWP_ASSEMBLY_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/GEMMLowp.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T2>
-class GEMMLowpAssemblyFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(size_t m, size_t n, size_t k)
-    {
-        const TensorShape shape_a(k, m);
-        const TensorShape shape_b(n, k);
-        const TensorShape shape_c(n, m);
-        _target    = compute_target(shape_a, shape_b, shape_c);
-        _reference = compute_reference(shape_a, shape_b, shape_c);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i, int lo, int hi)
-    {
-        std::uniform_int_distribution<> distribution(lo, hi);
-        library->fill(tensor, distribution, i);
-    }
-
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c)
-    {
-        DataType dt_in = std::is_same<T2, int8_t>::value ? DataType::S8 : DataType::U8;
-
-        // Create tensors
-        TensorType a = create_tensor<TensorType>(shape_a, dt_in, 1);
-        TensorType b = create_tensor<TensorType>(shape_b, dt_in, 1);
-        TensorType c = create_tensor<TensorType>(shape_c, DataType::S32, 1);
-
-        // Create and configure function
-        FunctionType gemmlowp;
-        gemmlowp.configure(&a, &b, nullptr, &c);
-
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        a.allocator()->allocate();
-        b.allocator()->allocate();
-        c.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        if(dt_in == DataType::S8)
-        {
-            fill(AccessorType(a), 0, -128, 127);
-            fill(AccessorType(b), 1, -128, 127);
-        }
-        else
-        {
-            fill(AccessorType(a), 0, 0, 255);
-            fill(AccessorType(b), 1, 0, 255);
-        }
-        fill(AccessorType(c), 2, 0, 0);
-
-        // Compute GEMM function
-        gemmlowp.run();
-        return c;
-    }
-
-    SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c)
-    {
-        DataType dt = std::is_same<T2, int8_t>::value ? DataType::S8 : DataType::U8;
-
-        // Create reference
-        SimpleTensor<T2> a{ shape_a, dt, 1 };
-        SimpleTensor<T2> b{ shape_b, dt, 1 };
-
-        // Fill reference
-        if(dt == DataType::S8)
-        {
-            fill(a, 0, -128, 127);
-            fill(b, 1, -128, 127);
-        }
-        else
-        {
-            fill(a, 0, 0, 255);
-            fill(b, 1, 0, 255);
-        }
-
-        return reference::gemmlowp<int32_t, T2>(a, b, shape_c);
-    }
-
-    TensorType            _target{};
-    SimpleTensor<int32_t> _reference{};
-};
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE */
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 95f49601a5..aa4eedb75d 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,22 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE
-#define ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H
 
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
+#include "src/core/utils/quantization/AsymmHelpers.h"
 #include "tests/validation/Helpers.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Validation.h"
 #include "tests/validation/reference/GEMMLowp.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
+#include "tests/validation/reference/DequantizationLayer.h"
 
-#include <random>
+#include <cstdint>
+#include <vector>
 
 namespace arm_compute
 {
@@ -49,119 +47,145 @@ namespace
 template <typename U>
 void fill(U &&tensor, int i)
 {
-    switch(tensor.data_type())
-    {
-        case DataType::QSYMM8_PER_CHANNEL:
-        {
-            int min_bound = 128;
-            int max_bound = -127;
-            for(size_t j = 0; j < tensor.quantization_info().scale().size(); j++)
-            {
-                std::pair<int, int> bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, i);
-                if(bounds.first < min_bound)
-                {
-                    min_bound = bounds.first;
-                }
-                if(bounds.second > max_bound)
-                {
-                    max_bound = bounds.second;
-                }
-            }
-            std::uniform_int_distribution<int8_t> distribution(min_bound, max_bound);
-            library->fill(tensor, distribution, i);
-            break;
-        }
-        case DataType::QASYMM8:
-        {
-            std::uniform_int_distribution<uint8_t> distribution(1, 254);
-            library->fill(tensor, distribution, i);
-            break;
-        }
-        case DataType::F16:
-        {
-            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
-            library->fill(tensor, distribution, i);
-            break;
-        }
-        case DataType::F32:
-        {
-            std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
-            library->fill(tensor, distribution, i);
-            break;
-        }
-        default:
-            library->fill_tensor_uniform(tensor, i);
-    }
+    library->fill_tensor_uniform(tensor, i);
 }
 
-template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false>
-TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                   GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8,
-                                   QuantizationInfo b_qinfo = QuantizationInfo())
+template <typename U>
+void fill_quantized(U &&tensor, int i)
 {
-    // Create tensors
-    DataType data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a;
-
-    TensorType a      = create_tensor<TensorType>(shape_a, data_type_a, 1);
-    TensorType b      = create_tensor<TensorType>(shape_b, data_type_b, 1); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated
-    TensorType output = create_tensor<TensorType>(shape_output, data_type_output, 1);
-
-    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
+    ARM_COMPUTE_ASSERT(is_data_type_quantized(tensor.data_type())); // This helper is only meant for tensors with a quantized data type
+    library->fill_tensor_uniform(tensor, i); // Uniform fill; i is forwarded unchanged as the library's second argument
+}
 
-    if(data_type_b == DataType::QSYMM8_PER_CHANNEL)
+template <typename U>
+void fill(U &&tensor, int i, int32_t min, int32_t max) // Fills an S32 or F32 tensor from a uniform distribution bounded by min and max
+{
+    if (tensor.data_type() == DataType::S32) {
+        std::uniform_int_distribution<int32_t> distribution(min, max);
+        library->fill(tensor, distribution, i);
+    }
+    else if(tensor.data_type() == DataType::F32)
     {
-        b.info()->set_quantization_info(b_qinfo);
+        std::uniform_real_distribution<float> distribution((float)min, (float)max); // The integer bounds are converted to float for the real distribution
+        library->fill(tensor, distribution, i);
     }
     else
     {
-        b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
+        ARM_COMPUTE_ERROR("NOT SUPPORTED!"); // Only S32 and F32 tensors are handled by this overload
+    }
+}
+
+/** Fill ranges and fill-index offset used by the GEMMLowp target/reference helpers when filling tensors */
+struct TensorFillInfo
+{
+    // Bias fill range, used when a bias tensor is filled. Default values are arbitrary
+    int32_t min_bias {-20000};
+    int32_t max_bias {20000};
+
+    // Output fill range, used when the output is pre-filled for accumulation. Default values are arbitrary
+    int32_t min_output {-20000};
+    int32_t max_output {20000};
+
+    // Optional value added to each per-tensor fill index to vary the generated data
+    int32_t hash     {0};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false, bool run_twice = false>
+TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                   const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8,
+                                   GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(),
+                                   bool accumulate = false, bool dynamic_qinfo = false, DataType data_type_output = DataType::UNKNOWN)
+{
+    ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
+    ARM_COMPUTE_ASSERT(data_type_a == data_type_b);
+    // If the output data type is unknown, derive it: S32 when there is no output stage, otherwise the input data type
+    if (data_type_output == DataType::UNKNOWN) {
+        data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a;
     }
+
+    // Create tensors (with dynamic_qinfo, a and b start from a placeholder dynamic QuantizationInfo; the requested one is set after configure)
+    TensorType a      = create_tensor<TensorType>(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : a_qinfo);
+    TensorType b      = create_tensor<TensorType>(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : b_qinfo); // NOTE: the GEMM output before the output stage mismatches if data_layout_output is passed here. To be investigated
+    TensorType output = create_tensor<TensorType>(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */);
+
     TensorType bias;
     if(is_fused)
     {
         TensorShape bias_shape(shape_b[0]);
-        bias = create_tensor<TensorType>(bias_shape, DataType::S32, 1);
+        bias = create_tensor<TensorType>(bias_shape,data_type_output == DataType::F32 ? DataType::F32 : DataType::S32, 1); // Bias is F32 only for F32 output, S32 otherwise
     }
 
     // Create and configure function
     // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output
     FunctionType gemmlowp;
-    // TODO (COMPMID-1672) - Extending the test to validate add bias in offset contribution
-    gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false, output_stage));
+    gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false,
+                                                                             output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/,
+                                                                             arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
+                                                                             false /* pretranspose_B */, accumulate));
+
+    // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic)
+    if (dynamic_qinfo)
+    {
+        a.info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true));
+        b.info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true));
+    }
+
+    ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
-    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+    add_padding_x({ &a, &b, &output });
 
     // Allocate tensors
     a.allocator()->allocate();
     b.allocator()->allocate();
     output.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
     // Fill tensors
-    fill(AccessorType(a), 0);
-    fill(AccessorType(b), 1);
+    fill_quantized(AccessorType(a), 0 + finfo.hash);
+    fill_quantized(AccessorType(b), 1 + finfo.hash);
+
+    if (accumulate)
+    {
+        ARM_COMPUTE_ASSERT(accumulate != run_twice); // accumulate is true in this branch, so this asserts that run_twice is false
+        fill(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output); // The output gets initial content only when accumulating
+    }
 
     if(is_fused)
     {
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
         bias.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        fill(AccessorType(bias), 2);
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        fill(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias);
     }
+
+    // Run with variable inputs: run once, refill the inputs with different fill indices, then fall through to the final run below.
+    if(run_twice)
+    {
+        gemmlowp.run();
+        fill_quantized(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run
+        fill_quantized(AccessorType(b), 4 + finfo.hash);
+        if(is_fused)
+        {
+            fill(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
+        }
+    }
+
     // Compute GEMM function
     gemmlowp.run();
     return output;
 }
 
-template <bool        reinterpret_input_as_3d, typename TI = uint8_t, typename TW = uint8_t>
-SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                                 DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, QuantizationInfo b_qinfo = QuantizationInfo())
+template <bool reinterpret_input_as_3d, typename TI = uint8_t, typename TW = uint8_t, bool pretranspose_A = false, bool pretranspose_B = false, bool run_twice = false>
+SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                                 DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, const TensorFillInfo& finfo = TensorFillInfo())
 {
+    ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
+    ARM_COMPUTE_ASSERT(data_type_a == data_type_b);
     TensorShape shape_a_to_use = shape_a;
     if(reinterpret_input_as_3d)
     {
@@ -170,101 +194,269 @@ SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, con
     }
 
     // Create reference
-    SimpleTensor<TI> a{ shape_a_to_use, data_type_a, 1 };
-    SimpleTensor<TW> b{ shape_b, data_type_b, 1, data_type_b == DataType::QSYMM8_PER_CHANNEL ? b_qinfo : QuantizationInfo(1.0f / 255, b_offset) };
+    SimpleTensor<TI> a{ shape_a_to_use, data_type_a, 1, a_qinfo };
+    SimpleTensor<TW> b{ shape_b, data_type_b, 1, b_qinfo };
+
+    TensorShape shape_a_to_use_transposed{ shape_a_to_use };
+    TensorShape shape_b_transposed{ shape_b };
+
+    shape_a_to_use_transposed.set(0, shape_a_to_use[1]);
+    shape_a_to_use_transposed.set(1, shape_a_to_use[0]);
+    shape_b_transposed.set(0, shape_b[1]);
+    shape_b_transposed.set(1, shape_b[0]);
+
+    SimpleTensor<TI> a_transposed{ shape_a_to_use_transposed, data_type_a, 1, a_qinfo };
+    SimpleTensor<TW> b_transposed{ shape_b_transposed, data_type_b, 1, b_qinfo };
 
     // Fill reference
-    fill(a, 0);
-    fill(b, 1);
-    return reference::gemmlowp_matrix_multiply_core<int32_t, TI, TW>(a, b, shape_output, a_offset, b_offset);
-}
+    fill_quantized(a, 0 + finfo.hash);
+    fill_quantized(b, 1 + finfo.hash);
+
+    // Transpose reference if required
+    /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
+       therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K)
+       in order to be able to call reference implementation that works with (B x M x K) input.
+       Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), B must be pre-transposed before passing it to the fixture. */
+    if(pretranspose_A)
+    {
+        transpose_matrix<TI>(a, a_transposed);
+    }
+
+    if(pretranspose_B)
+    {
+        transpose_matrix<TW>(b, b_transposed);
+    }
+
+    // Run with variable inputs.
+    const int32_t a_offset = a_qinfo.uniform().offset;
+    const int32_t b_offset = b_qinfo.uniform().offset;
+
+    if(run_twice)
+    {
+        reference::gemmlowp_matrix_multiply_core<int32_t, TI, TW>((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset);
+        fill_quantized((pretranspose_A) ? a_transposed : a, 3 + finfo.hash);
+        fill_quantized((pretranspose_B) ? b_transposed : b, 4 + finfo.hash);
+    }
+
+    return reference::gemmlowp_matrix_multiply_core<int32_t, TI, TW>((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset);
 }
+} // namespace
 
-template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false>
-class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
+class GEMMLowpGenericMatrixMultiplyCoreValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate=false, bool dynamic_qinfo = false)
     {
-        _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset);
-        _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset);
+        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
+        const auto b_qinfo = QuantizationInfo(1.0f / 255, b_offset);
+        TensorFillInfo finfo;
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate);
     }
 
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo)
     {
-        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t>(shape_a, shape_b, shape_output, a_offset, b_offset);
+        const auto output_qinfo = QuantizationInfo(); // No output stage
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo);
     }
 
-    SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+    SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate)
     {
-        return compute_gemmlowp_reference<reinterpret_input_as_3d>(shape_a, shape_b, shape_output, a_offset, b_offset);
+        SimpleTensor<int32_t> ref_output =  compute_gemmlowp_reference<reinterpret_input_as_3d, uint8_t, uint8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo,
+        DataType::QASYMM8, DataType::QASYMM8, finfo);
+
+        if (accumulate)
+        {
+            SimpleTensor<int32_t> output{ shape_output, DataType::S32, 1 };
+            fill(output, 6 + finfo.hash, finfo.min_output, finfo.max_output);
+            reference::arithmetic_operation<int32_t>(reference::ArithmeticOperation::ADD, output, ref_output, output, ConvertPolicy::SATURATE);
+            return output;
+        }
+
+        return ref_output;
     }
 
     TensorType            _target{};
     SimpleTensor<int32_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t>
-class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
+class GEMMLowpMatrixMultiplyCoreValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage, DataType data_type_b)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
     {
-        ARM_COMPUTE_EXPECT(output_stage.type != GEMMLowpOutputStageType::NONE, framework::LogLevel::ERRORS);
-        DataType data_type_a = data_type_b == DataType::QASYMM8_SIGNED ? DataType::QASYMM8_SIGNED : DataType::QASYMM8;
+        GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>::setup(shape_a, shape_b, shape_output, a_offset, b_offset, false /* accumulate */);
+    }
+};
 
-        if(data_type_b == DataType::QSYMM8_PER_CHANNEL)
-        {
-            output_stage.is_quantized_per_channel              = true;
-            const size_t                          num_channels = shape_b[0];
-            std::vector<float>                    scales(num_channels);
-            std::uniform_real_distribution<float> distribution(0.f, 1.f);
-            library->fill(scales, distribution, 0);
-            output_stage.gemmlowp_multipliers.resize(num_channels);
-            output_stage.gemmlowp_shifts.resize(num_channels);
-            for(size_t i = 0; i < num_channels; ++i)
-            {
-                quantization::calculate_quantized_multiplier(scales[i], &output_stage.gemmlowp_multipliers[i], &output_stage.gemmlowp_shifts[i]);
-            }
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
+class GEMMLowpMatrixMultiplyAccumulateValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    {
+        GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>::setup(shape_a, shape_b, shape_output, a_offset, b_offset, true /* accumulate */);
+    }
+};
 
-            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_a, data_type_b, QuantizationInfo(scales));
-            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_a, data_type_b, QuantizationInfo(scales));
-        }
-        else
-        {
-            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_a, data_type_b, QuantizationInfo());
-            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_a, data_type_b, QuantizationInfo());
-        }
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
+class GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    {
+        GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>::setup(shape_a, shape_b, shape_output, a_offset, b_offset, false /* accumulate */, true /* dynamic_qinfo */);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false>
+class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture
+{
+public:
+    /** Dynamically initialize the quantization info with saturation awareness
+     */
+    template <typename T>
+    static void setup_quantization(DataType data_type, const TensorShape& shape_a, const TensorShape& shape_b, QuantizationInfo& a_qinfo, QuantizationInfo& b_qinfo, QuantizationInfo& output_qinfo, TensorFillInfo& finfo)
+    {
+        // This hash is used to seed the random generators. Hash collisions are possible,
+        // but this is an intentional trade-off: it is a very easy way to make the random
+        // generation process differ across most test configurations, which were all using
+        // the same set of values before.
+        finfo.hash = shape_a[0] + shape_a[1] + shape_b[0] + shape_b[1];
+
+        const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+        const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+
+        std::mt19937                           generator(library->seed() + finfo.hash);
+        std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        a_qinfo = QuantizationInfo(scale_lhs, offset_lhs);
+        b_qinfo = QuantizationInfo(scale_rhs, offset_rhs);
+
+        // reinterpret_input_as_3d or reinterpret_output_as_3d can be ignored, as the underlying gemm / matmul computation
+        // is equivalent to a standard 2D one with m-n-k dimensions
+        const int m = shape_a.y();
+        const int n = shape_b.x();
+        const int k = shape_a.x();
+
+        const float bias_fraction = 0.5f; // We enabled is_fused in compute_gemmlowp_target below, thus bias is included
+
+        QuantizationHint q_hint = suggest_matmul_dst_q_info_and_bias(a_qinfo, b_qinfo, m, n, k, data_type, bias_fraction);
+        output_qinfo            = q_hint.q_info;
+        finfo.min_bias          = q_hint.bias_min;
+        finfo.max_bias          = q_hint.bias_max;
+
+        // Both target and reference implementations use negated offsets, i.e.
+        //      float_val = (int_val + offset) * scale
+        // instead of
+        //      float_val = (int_val - offset) * scale
+        // as usual. Therefore, after calculating the output quantization above, we
+        // negate the inputs' offsets.
+        a_qinfo = QuantizationInfo(scale_lhs, -offset_lhs);
+        b_qinfo = QuantizationInfo(scale_rhs, -offset_rhs);
+    }
+
+    /** Initialize output stage info from quantization info */
+    static Status init_gemmlowp_output_stage_info(
+                                        DataType                data_type,
+                                        const QuantizationInfo& a_qinfo,
+                                        const QuantizationInfo& b_qinfo,
+                                        const QuantizationInfo& output_qinfo,
+                                        GEMMLowpOutputStageType type,
+                                        GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_quantized_asymmetric(data_type));
+
+        const UniformQuantizationInfo aq_unif   = a_qinfo.uniform();
+        const UniformQuantizationInfo bq_unif   = b_qinfo.uniform();
+        const UniformQuantizationInfo oq_unif   = output_qinfo.uniform();
+
+        float   multiplier = (aq_unif.scale * bq_unif.scale) / oq_unif.scale;
+        int32_t int_multiplier;
+        int32_t shift;
+
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &int_multiplier, &shift));
+
+        int32_t type_min             = 0;
+        int32_t type_max             = 0;
+        std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type);
+
+        gemmlowp_output_stage_info.gemmlowp_real_multiplier = multiplier;
+        gemmlowp_output_stage_info.gemmlowp_multiplier = int_multiplier;
+        gemmlowp_output_stage_info.gemmlowp_multipliers = { int_multiplier };
+        gemmlowp_output_stage_info.gemmlowp_shift      = shift;
+        gemmlowp_output_stage_info.gemmlowp_shifts     = { shift };
+        gemmlowp_output_stage_info.gemmlowp_offset     = oq_unif.offset;
+        gemmlowp_output_stage_info.type                = type;
+        gemmlowp_output_stage_info.gemmlowp_min_bound  = type_min;
+        gemmlowp_output_stage_info.gemmlowp_max_bound  = type_max;
+
+        return Status{};
+    }
+
+    /** Set up the fused output stage test with randomized quantization info.
+     *
+     * Only configurations where a and b share the same data type, and that data type is
+     * quantized asymmetric, are tested. The output stage type must not be NONE.
+     *
+     */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type,
+               bool reshape_b_only_on_first_run)
+    {
+        ARM_COMPUTE_ASSERT(output_stage_type != GEMMLowpOutputStageType::NONE);
+        ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type));
+
+        // Randomized dynamic quantization: randomize quantization info in a way that ensures no result saturation
+        // most of the time
+        QuantizationInfo a_qinfo;
+        QuantizationInfo b_qinfo;
+        QuantizationInfo output_qinfo;
+        TensorFillInfo finfo;
+        setup_quantization<TI>(data_type, shape_a, shape_b, a_qinfo, b_qinfo, output_qinfo, finfo);
+
+        GEMMLowpOutputStageInfo output_stage;
+        ARM_COMPUTE_ASSERT(bool(init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, output_qinfo, output_stage_type, output_stage))); // On error the helper returns before populating output_stage
+
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, output_stage, finfo);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo);
     }
 
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage,
-                              DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo,
+                              DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo())
     {
-        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true>(shape_a, shape_b, shape_output, a_offset, b_offset,
-                output_stage, data_type_a, data_type_b, b_qinfo);
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true, run_twice>(shape_a, shape_b, shape_output, a_qinfo,
+                b_qinfo, output_qinfo, data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo);
     }
 
-    SimpleTensor<TI> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                       GEMMLowpOutputStageInfo output_stage, DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo)
+    SimpleTensor<TI> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                       DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, const TensorFillInfo& finfo = TensorFillInfo())
     {
-        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d, TI, TW>(shape_a, shape_b, shape_output, a_offset, b_offset, data_type_a, data_type_b, b_qinfo);
+        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d, TI, TW, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type_a, data_type_b, finfo);
 
         TensorShape           bias_shape(shape_b[0]);
         SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 };
-        fill(bias, 2);
+        (run_twice) ? fill(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target
 
         switch(output_stage.type)
         {
             case GEMMLowpOutputStageType::QUANTIZE_DOWN:
-                return reference::gemmlowp_quantize_down_scale<int32_t, TW>(output, bias,
+                return reference::gemmlowp_quantize_down_scale<int32_t, TI>(output, bias,
                                                                             output_stage.gemmlowp_offset, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
                 break;
             case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
-                return reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TW>(output, bias,
+                return reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TI>(output, bias,
                                                                                           output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
                 break;
             default:
@@ -276,11 +468,78 @@ protected:
     SimpleTensor<TI> _reference{};
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
+class GEMMLowpDequantizedMatrixMultiplyValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate)
+    {
+        const bool dynamic_qinfo = false;
+        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
+        const auto b_qinfo = QuantizationInfo(5.0f / 255, b_offset);
+        TensorFillInfo finfo;
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
+    }
+
+protected:
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo)
+    {
+        const auto output_qinfo = QuantizationInfo();
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::F32);
+    }
+
+    SimpleTensor<float> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate, const bool dynamic_qinfo)
+    {
+        QuantizationInfo s32_ref_output_quant_info = QuantizationInfo(a_qinfo.uniform().scale * b_qinfo.uniform().scale, 0, dynamic_qinfo);
+
+        SimpleTensor<int32_t> s32_ref_output =  compute_gemmlowp_reference<reinterpret_input_as_3d, int8_t, int8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo,
+        DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, finfo);
+        s32_ref_output.quantization_info(s32_ref_output_quant_info);
+
+        SimpleTensor<float> f32_ref_output(s32_ref_output.shape(), DataType::F32);
+        f32_ref_output = reference::dequantization_layer<float, int32_t>(s32_ref_output);
+
+        if (accumulate)
+        {
+            SimpleTensor<float> output{ shape_output, DataType::F32, 1 };
+            fill(output, 6 + finfo.hash, finfo.min_output, finfo.max_output);
+            reference::arithmetic_operation<float>(reference::ArithmeticOperation::ADD, output, f32_ref_output, output, ConvertPolicy::SATURATE);
+            return output;
+        }
+
+        return f32_ref_output;
+    }
+
+    TensorType            _target{};
+    SimpleTensor<float> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false>
+class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice>
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run)
+    {
+        GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice>::setup(shape_a, shape_b,
+                shape_output, output_stage_type, data_type, reshape_b_only_on_first_run);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false>
+class GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice>
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run)
+    {
+        GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice>::setup(shape_a, shape_b, shape_output, output_stage_type, data_type, reshape_b_only_on_first_run);
+    }
+};
+
 template <typename TensorType, typename AccessorType, typename FunctionType>
 class GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
     {
         _target    = compute_target(shape, result_offset, result_mult_int, result_shift, min, max, add_bias);
@@ -316,27 +575,27 @@ protected:
         output_stage_info.output_data_type        = DataType::QASYMM8;
         output_stage.configure(&a, add_bias ? &b : nullptr, &c, output_stage_info);
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         c.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
 
         // Fill tensor
         fill(AccessorType(a), 0);
 
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
             // Allocate bias tensor
             b.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
             // Fill tensor
             fill(AccessorType(b), 1);
@@ -382,7 +641,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType>
 class GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
     {
         _target    = compute_target(shape, result_offset, result_mult_int, result_shift, min, max, add_bias);
@@ -418,27 +676,27 @@ protected:
         output_stage_info.output_data_type        = DataType::QASYMM8_SIGNED;
         output_stage.configure(&a, add_bias ? &b : nullptr, &c, output_stage_info);
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         c.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
 
         // Fill tensor
         fill(AccessorType(a), 0);
 
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
             // Allocate bias tensor
             b.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
             // Fill tensor
             fill(AccessorType(b), 1);
@@ -484,7 +742,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType>
 class GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max, bool add_bias)
     {
         _target    = compute_target(shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias);
@@ -512,27 +769,27 @@ protected:
         FunctionType output_stage;
         output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         c.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
 
         // Fill tensor
         fill(AccessorType(a), 0);
 
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
             // Allocate bias tensor
             b.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
             // Fill tensor
             fill(AccessorType(b), 1);
@@ -579,7 +836,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType>
 class GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max, bool add_bias)
     {
         _target    = compute_target(shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias);
@@ -607,27 +863,27 @@ protected:
         FunctionType output_stage;
         output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         c.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
 
         // Fill tensor
         fill(AccessorType(a), 0);
 
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
             // Allocate bias tensor
             b.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
             // Fill tensor
             fill(AccessorType(b), 1);
@@ -674,7 +930,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(DataType data_type, TensorShape shape, float result_real_multiplier, int32_t result_offset, int32_t min, int32_t max, bool add_bias)
     {
         _target    = compute_target(data_type, shape, result_real_multiplier, result_offset, min, max, add_bias);
@@ -712,27 +967,27 @@ protected:
         FunctionType output_stage;
         output_stage.configure(&a, add_bias ? &b : nullptr, &c, info);
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         c.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
 
         // Fill tensor
         fill(AccessorType(a), 0);
 
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
             // Allocate bias tensor
             b.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
             // Fill tensor
             fill(AccessorType(b), 1);
@@ -777,7 +1032,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType>
 class GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, int32_t result_fixedpoint_multiplier, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
     {
         _target    = compute_target(shape, result_fixedpoint_multiplier, result_shift, min, max, add_bias);
@@ -805,27 +1059,27 @@ protected:
         FunctionType output_stage;
         output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_fixedpoint_multiplier, result_shift, min, max);
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(c.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         c.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!c.info()->is_resizable());
 
         // Fill tensor
         fill(AccessorType(a), 0);
 
         if(add_bias)
         {
-            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
             // Allocate bias tensor
             b.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
             // Fill tensor
             fill(AccessorType(b), 1);
@@ -868,11 +1122,10 @@ protected:
     SimpleTensor<int16_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename ReshapeLHSFunctionType, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyReshapedValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs,
                bool interleave_rhs, DataType data_type)
     {
@@ -938,15 +1191,17 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeLHSFunctionType reshape_lhs;
-        ReshapeRHSFunctionType reshape_rhs;
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
         GEMMFunctionType       gemm;
-        reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -955,20 +1210,23 @@ protected:
         rhs_reshaped.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        reshape_lhs.run();
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1014,11 +1272,10 @@ protected:
     SimpleTensor<int32_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename ReshapeLHSFunctionType, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
                bool interleave_lhs, bool interleave_rhs, DataType data_type)
     {
@@ -1088,15 +1345,17 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeLHSFunctionType reshape_lhs;
-        ReshapeRHSFunctionType reshape_rhs;
+        ReshapeLHSOperatorType reshape_lhs;
+        ReshapeRHSOperatorType reshape_rhs;
         GEMMFunctionType       gemm;
-        reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
+        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1105,20 +1364,23 @@ protected:
         rhs_reshaped.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        reshape_lhs.run();
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
+        reshape_lhs.run(reshape_lhs_pack);
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1166,11 +1428,10 @@ protected:
     SimpleTensor<int32_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0,
                unsigned int k0, unsigned int h0, bool interleave_rhs, bool transpose_rhs, DataType data_type)
     {
@@ -1239,13 +1500,15 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeRHSFunctionType reshape_rhs;
+        ReshapeRHSOperatorType reshape_rhs;
         GEMMFunctionType       gemm;
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info);
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+
+        add_padding_x({ &lhs, &rhs, &rhs_reshaped, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1253,18 +1516,20 @@ protected:
         rhs_reshaped.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1305,11 +1570,372 @@ protected:
     SimpleTensor<int32_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename ReshapeRHSFunctionType, typename GEMMFunctionType>
+template <typename T, typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType, typename ReduceOperation, typename CastOperation>
+class GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageValidationFixture : public framework::Fixture
+{
+public:
+    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0,
+               unsigned int k0, unsigned int h0, bool interleave_rhs, bool transpose_rhs, bool broadcast_bias, DataType data_type)
+    {
+        GEMMLowpOutputStageInfo output_stage;
+        output_stage.type                    = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_stage.output_data_type        = data_type;
+        output_stage.gemmlowp_multipliers    = std::vector<int32_t> { 1 };
+        output_stage.gemmlowp_shifts         = std::vector<int32_t> { 1 };
+        output_stage.gemmlowp_multipliers[0] = 1;
+        output_stage.gemmlowp_shifts[0]      = 1;
+        output_stage.gemmlowp_offset         = 0;
+        constexpr float scale                = 0.001f;
+        quantization::calculate_quantized_multiplier(scale, &output_stage.gemmlowp_multipliers[0], &output_stage.gemmlowp_shifts[0]);
+        output_stage.gemmlowp_min_bound = -100;
+        output_stage.gemmlowp_max_bound = 100;
+
+        GEMMLHSMatrixInfo lhs_info;
+        lhs_info.m0 = m0;
+        lhs_info.k0 = k0;
+
+        GEMMRHSMatrixInfo rhs_info;
+        rhs_info.n0         = n0;
+        rhs_info.k0         = k0;
+        rhs_info.h0         = h0;
+        rhs_info.interleave = interleave_rhs;
+        rhs_info.transpose  = transpose_rhs;
+
+        int a_offset = 1;
+        int b_offset = 1;
+
+        // Set the tensor shapes for the LHS, RHS and bias tensors
+        const TensorShape lhs_shape(k, m, batch_size);
+        const TensorShape rhs_shape(n, k, batch_size);
+        const TensorShape bias_shape(n,
+                                     broadcast_bias ? 1 : m,
+                                     broadcast_bias ? 1 : batch_size);
+
+        _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, output_stage, a_offset, b_offset);
+        if(gemm_validated == true)
+        {
+            _reference = compute_reference(lhs_shape, rhs_shape, bias_shape, data_type, output_stage, a_offset, b_offset);
+        }
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::QASYMM8:
+            {
+                // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
+                std::uniform_int_distribution<> distribution(1, 254);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            case DataType::QASYMM8_SIGNED:
+            {
+                std::uniform_int_distribution<> distribution(-127, 126);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            case DataType::S32:
+            {
+                std::uniform_int_distribution<> distribution(-10000, 10000);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Unsupported data type");
+        }
+    }
+
+    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info,
+                              const GEMMRHSMatrixInfo &rhs_info, DataType data_type, GEMMLowpOutputStageInfo output_stage, const int a_offset, const int b_offset)
+    {
+        // Create tensors
+        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, a_offset));
+        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, b_offset));
+        TensorType bias = create_tensor<TensorType>(bias_shape, DataType::S32, 1);
+        TensorType dst;
+        TensorType rhs_reshaped;
+
+        const unsigned int M = lhs_shape[1];
+        const unsigned int N = rhs_shape[0];
+        const unsigned int K = lhs_shape[0];
+
+        // Tensors for precomputing sum of lhs rows / rhs columns
+        TensorType vec_sum_rows = create_tensor<TensorType>(TensorShape(M, 1, lhs_shape[2]), DataType::S32, 1);
+        TensorType vec_sum_cols = create_tensor<TensorType>(TensorShape(N, 1, rhs_shape[2]), DataType::S32, 1);
+
+        GEMMKernelInfo gemm_info;
+        gemm_info.m            = M;
+        gemm_info.n            = N;
+        gemm_info.k            = K;
+        gemm_info.lhs_info     = lhs_info;
+        gemm_info.rhs_info     = rhs_info;
+        gemm_info.output_stage = output_stage;
+        gemm_info.a_offset     = a_offset;
+        gemm_info.b_offset     = b_offset;
+        // The output tensor will be auto-initialized within the function
+
+        // Create and configure function
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMFunctionType       gemm;
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+
+        // Run GEMM only if validate() accepts this configuration. The validation also
+        // checks whether this extension is supported: if it is not, the test will be
+        // skipped. If it is supported, the test will fail anyway because target and
+        // reference will not match.
+        gemm_validated = bool(gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, vec_sum_cols.info(), vec_sum_rows.info(), bias.info()));
+        if(gemm_validated == true)
+        {
+            gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, vec_sum_cols.info(), vec_sum_rows.info(), bias.info());
+
+            ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+            // Allocate tensors
+            lhs.allocator()->allocate();
+            rhs.allocator()->allocate();
+            rhs_reshaped.allocator()->allocate();
+            bias.allocator()->allocate();
+            vec_sum_cols.allocator()->allocate();
+            vec_sum_rows.allocator()->allocate();
+            dst.allocator()->allocate();
+
+            ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!vec_sum_cols.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!vec_sum_rows.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+            // Fill tensors
+            fill(AccessorType(lhs), 0);
+            fill(AccessorType(rhs), 1);
+            fill(AccessorType(bias), 2);
+
+            TensorType    lhs_32 = create_tensor<TensorType>(lhs_shape, DataType::S32, 1);
+            TensorType    rhs_32 = create_tensor<TensorType>(rhs_shape, DataType::S32, 1);
+            CastOperation cast_lhs;
+            CastOperation cast_rhs;
+            cast_lhs.configure(&lhs, &lhs_32, ConvertPolicy::SATURATE);
+            cast_rhs.configure(&rhs, &rhs_32, ConvertPolicy::SATURATE);
+            lhs_32.allocator()->allocate();
+            rhs_32.allocator()->allocate();
+            cast_lhs.run();
+            cast_rhs.run();
+
+            ReduceOperation lhs_sum_rows;
+            ReduceOperation rhs_sum_cols;
+
+            lhs_sum_rows.configure(&lhs_32, &vec_sum_rows, 0, ReductionOperation::SUM, false);
+            rhs_sum_cols.configure(&rhs_32, &vec_sum_cols, 1, ReductionOperation::SUM);
+
+            lhs_sum_rows.run();
+            rhs_sum_cols.run();
+
+            // Compute GEMM
+            ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+            reshape_rhs.run(reshape_rhs_pack);
+            ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, { ACL_DST, &dst }, { ACL_VEC_COL_SUM, &vec_sum_cols }, { ACL_VEC_ROW_SUM, &vec_sum_rows } });
+            gemm.run(gemm_pack);
+        }
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, DataType data_type, GEMMLowpOutputStageInfo output_stage,
+                                      const int a_offset, const int b_offset)
+    {
+        TensorShape dst_shape = lhs_shape;
+        dst_shape[0]          = rhs_shape[0];
+        dst_shape[1]          = lhs_shape[1];
+
+        // Create reference
+        SimpleTensor<T>       lhs{ lhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, a_offset) };
+        SimpleTensor<T>       rhs{ rhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, b_offset) };
+        SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 };
+        SimpleTensor<int32_t> dst{ dst_shape, DataType::S32, 1 };
+        SimpleTensor<T>       dst_final{ dst_shape, data_type, 1 };
+
+        // Fill reference
+        fill(lhs, 0);
+        fill(rhs, 1);
+        fill(bias, 2);
+
+        dst       = reference::gemmlowp_matrix_multiply_core<int32_t, T>(lhs, rhs, dst_shape, a_offset, b_offset);
+        dst_final = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, T>(dst, bias,
+                                                                                      output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
+        return dst_final;
+    }
+
+    bool            gemm_validated = true;
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
+class GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULValidationFixture : public framework::Fixture
+{
+public:
+    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0,
+               unsigned int k0, unsigned int h0, bool interleave_rhs, bool transpose_rhs, DataType data_type)
+    {
+        GEMMLHSMatrixInfo lhs_info;
+        lhs_info.m0 = m0;
+        lhs_info.k0 = k0;
+
+        GEMMRHSMatrixInfo rhs_info;
+        rhs_info.n0         = n0;
+        rhs_info.k0         = k0;
+        rhs_info.h0         = h0;
+        rhs_info.interleave = interleave_rhs;
+        rhs_info.transpose  = transpose_rhs;
+
+        // Set the tensor shapes for LHS and RHS matrices
+        const TensorShape lhs_shape(k, m, batch_size);
+        const TensorShape rhs_shape(n, k, batch_size);
+
+        _target = compute_target(lhs_shape, rhs_shape, lhs_info, rhs_info, data_type);
+        if(gemm_validated == true)
+        {
+            _reference = compute_reference(lhs_shape, rhs_shape, data_type);
+        }
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::QASYMM8:
+            {
+                // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
+                std::uniform_int_distribution<> distribution(1, 254);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            case DataType::QASYMM8_SIGNED:
+            {
+                std::uniform_int_distribution<> distribution(-127, 126);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Unsupported data type");
+        }
+    }
+
+    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const GEMMLHSMatrixInfo &lhs_info,
+                              const GEMMRHSMatrixInfo &rhs_info, DataType data_type)
+    {
+        // Create tensors
+        TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
+        TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
+        TensorType rhs_reshaped;
+        TensorType dst;
+
+        const unsigned int M = lhs_shape[1];
+        const unsigned int N = rhs_shape[0];
+        const unsigned int K = lhs_shape[0];
+
+        GEMMKernelInfo gemm_info;
+        gemm_info.m        = M;
+        gemm_info.n        = N;
+        gemm_info.k        = K;
+        gemm_info.lhs_info = lhs_info;
+        gemm_info.rhs_info = rhs_info;
+        // The output tensor will be auto-initialized within the function
+
+        // Create and configure function
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMFunctionType       gemm;
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+
+        // Run GEMM only if validate() accepts this configuration. The validation also
+        // checks whether this extension is supported: if it is not, the test will be
+        // skipped. If it is supported, the test will fail anyway because target and
+        // reference will not match.
+        gemm_validated = bool(gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, nullptr, nullptr, nullptr));
+        if(gemm_validated == true)
+        {
+            gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, nullptr, nullptr, nullptr);
+
+            ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+
+            // Allocate tensors
+            lhs.allocator()->allocate();
+            rhs.allocator()->allocate();
+            rhs_reshaped.allocator()->allocate();
+            dst.allocator()->allocate();
+
+            ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+            // Fill tensors
+            fill(AccessorType(lhs), 0);
+            fill(AccessorType(rhs), 1);
+
+            // Compute GEMM
+            ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+            reshape_rhs.run(reshape_rhs_pack);
+            ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+            gemm.run(gemm_pack);
+        }
+
+        return dst;
+    }
+
+    SimpleTensor<int32_t> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type)
+    {
+        TensorShape dst_shape = lhs_shape;
+        dst_shape[0]          = rhs_shape[0];
+        dst_shape[1]          = lhs_shape[1];
+
+        if(data_type == DataType::QASYMM8)
+        {
+            // Create reference
+            SimpleTensor<uint8_t> lhs{ lhs_shape, data_type, 1 };
+            SimpleTensor<uint8_t> rhs{ rhs_shape, data_type, 1 };
+            SimpleTensor<int32_t> dst{ dst_shape, DataType::S32, 1 };
+
+            // Fill reference
+            fill(lhs, 0);
+            fill(rhs, 1);
+
+            return reference::gemmlowp_matrix_multiply_core<int32_t, uint8_t>(lhs, rhs, dst_shape, 0, 0);
+        }
+        else
+        {
+            // Create reference
+            SimpleTensor<int8_t>  lhs{ lhs_shape, data_type, 1 };
+            SimpleTensor<int8_t>  rhs{ rhs_shape, data_type, 1 };
+            SimpleTensor<int32_t> dst{ dst_shape, DataType::S32, 1 };
+
+            // Fill reference
+            fill(lhs, 0);
+            fill(rhs, 1);
+
+            return reference::gemmlowp_matrix_multiply_core<int32_t, int8_t>(lhs, rhs, dst_shape, 0, 0);
+        }
+    }
+
+    bool                  gemm_validated = true;
+    TensorType            _target{};
+    SimpleTensor<int32_t> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0,
                unsigned int k0, unsigned int h0, bool interleave_rhs, bool transpose_rhs, DataType data_type)
     {
@@ -1382,13 +2008,15 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        ReshapeRHSFunctionType reshape_rhs;
+        ReshapeRHSOperatorType reshape_rhs;
         GEMMFunctionType       gemm;
-        reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info);
-        gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info);
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+        gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info);
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &rhs_reshaped, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
@@ -1396,18 +2024,20 @@ protected:
         rhs_reshaped.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        reshape_rhs.run();
-        gemm.run();
+        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+        reshape_rhs.run(reshape_rhs_pack);
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1454,7 +2084,6 @@ template <typename TensorType, typename AccessorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyNativeValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0)
     {
         GEMMLHSMatrixInfo lhs_info;
@@ -1497,26 +2126,29 @@ protected:
 
         // Create and configure function
         GEMMFunctionType gemm;
-        gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
+        gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K));
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+
+        add_padding_x({ &lhs, &rhs, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
         rhs.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1546,7 +2178,6 @@ template <typename TensorType, typename AccessorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyNative3DValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0)
     {
         GEMMLHSMatrixInfo lhs_info;
@@ -1592,26 +2223,29 @@ protected:
 
         // Create and configure function
         GEMMFunctionType gemm;
-        gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
+        gemm.configure(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, GEMMReshapeInfo(M, N, K, 1, 1, m_h));
+
+        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
 
-        ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+        add_padding_x({ &lhs, &rhs, &dst });
 
         // Allocate tensors
         lhs.allocator()->allocate();
         rhs.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(lhs), 0);
         fill(AccessorType(rhs), 1);
 
         // Compute GEMM
-        gemm.run();
+        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_DST, &dst } });
+        gemm.run(gemm_pack);
 
         return dst;
     }
@@ -1641,4 +2275,4 @@ protected:
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H
diff --git a/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h b/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h
index d0855093a7..d88029f93e 100644
--- a/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h
+++ b/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,11 +46,10 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool reinterpret_input_as_3d = false>
+template <typename TensorType, typename AccessorType, typename OperatorType, typename T, bool reinterpret_input_as_3d = false>
 class GEMMReshapeLHSMatrixValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape_in, unsigned int batch_size, DataType data_type, unsigned int m0, unsigned int k0, unsigned int v0, bool interleave, bool transpose)
     {
         GEMMLHSMatrixInfo lhs_info;
@@ -86,23 +85,26 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        FunctionType gemm_lhs_reshape;
-        gemm_lhs_reshape.configure(&src, &dst, lhs_info, reinterpret_input_as_3d);
+        OperatorType gemm_lhs_reshape;
+        gemm_lhs_reshape.configure(src.info(), dst.info(), lhs_info, reinterpret_input_as_3d);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+
+        add_padding_x({ &src, &dst });
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
 
         // Compute GEMM LHS matrix reshape function
-        gemm_lhs_reshape.run();
+        ITensorPack tensors = { { ACL_SRC, &src }, { ACL_DST, &dst } };
+        gemm_lhs_reshape.run(tensors);
 
         return dst;
     }
diff --git a/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h b/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h
index 99bfa3bced..0929faf04a 100644
--- a/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h
+++ b/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,11 +46,10 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename OperatorType, typename T>
 class GEMMReshapeRHSMatrixValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape_in, unsigned int batch_size, DataType data_type, unsigned int n0, unsigned int k0, unsigned int h0, bool interleave, bool transpose)
     {
         GEMMRHSMatrixInfo rhs_info;
@@ -85,23 +84,26 @@ protected:
         // The output tensor will be auto-initialized within the function
 
         // Create and configure function
-        FunctionType gemm_rhs_reshape;
-        gemm_rhs_reshape.configure(&src, &dst, rhs_info);
+        OperatorType gemm_rhs_reshape;
+        gemm_rhs_reshape.configure(src.info(), dst.info(), rhs_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+
+        add_padding_x({ &src, &dst });
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
 
         // Compute GEMM RHS matrix reshape function
-        gemm_rhs_reshape.run();
+        ITensorPack tensors = { { ACL_SRC, &src }, { ACL_DST, &dst } };
+        gemm_rhs_reshape.run(tensors);
 
         return dst;
     }
diff --git a/tests/validation/fixtures/GEMMTranspose1xWFixture.h b/tests/validation/fixtures/GEMMTranspose1xWFixture.h
index 2d2e70697a..3765515b57 100644
--- a/tests/validation/fixtures/GEMMTranspose1xWFixture.h
+++ b/tests/validation/fixtures/GEMMTranspose1xWFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class GEMMTranspose1xWValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(size_t x, size_t y, DataType data_type)
     {
         _data_type = data_type;
@@ -89,24 +88,25 @@ protected:
 
         // Create and configure function
         FunctionType f;
-        f.configure(&a, &b);
+        f.configure(a.info(), b.info());
 
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         b.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(a), 0);
         fill(AccessorType(b), 1);
 
         // Compute GEMM function
-        f.run();
+        ITensorPack tensors{ { ACL_SRC, &a }, { ACL_DST, &b } };
+        f.run(tensors);
 
         return b;
     }
diff --git a/tests/validation/fixtures/GatherFixture.h b/tests/validation/fixtures/GatherFixture.h
index 0a9f8c1d15..857b0387b7 100644
--- a/tests/validation/fixtures/GatherFixture.h
+++ b/tests/validation/fixtures/GatherFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class GatherFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape indices_shape, int axis, DataType data_type)
     {
         _target    = compute_target(input_shape, data_type, axis, indices_shape);
@@ -67,9 +66,11 @@ protected:
         std::mt19937 gen(library->seed());
         uint32_t    *indices_ptr = static_cast<uint32_t *>(indices.data());
 
-        std::uniform_int_distribution<uint32_t> dist_index(0, input_shape[actual_axis] - 1);
-        //Let's consider 1D indices
-        for(unsigned int ind = 0; ind < indices_shape[0]; ind++)
+        // Roughly 10% of the generated indices are out-of-range (at least one such value always exists, so more for small axes).
+        uint32_t max_index = input_shape[actual_axis] + input_shape[actual_axis] / 9 + 1;
+        std::uniform_int_distribution<uint32_t> dist_index(0, max_index - 1);
+
+        for(unsigned int ind = 0; ind < indices_shape.total_size(); ind++)
         {
             indices_ptr[ind] = dist_index(gen);
         }
@@ -91,18 +92,18 @@ protected:
         FunctionType gather;
         gather.configure(&src, &indices_tensor, &dst, axis);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(indices_tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(indices_tensor.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         indices_tensor.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!indices_tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!indices_tensor.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index b6cf18bd4c..5c7978f4ab 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,10 +45,9 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool batch_size_on_z>
-class Im2ColValidationFixture : public framework::Fixture
+class Im2ColOpValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout,
                unsigned int num_groups)
     {
@@ -88,23 +87,28 @@ protected:
 
         // Create and configure function
         FunctionType im2col_func;
-        im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
+        im2col_func.configure(src.info(), dst.info(), _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
 
+        arm_compute::ITensorPack pack =
+        {
+            { arm_compute::TensorType::ACL_SRC, &src },
+            { arm_compute::TensorType::ACL_DST, &dst }
+        };
         // Compute function
-        im2col_func.run();
+        im2col_func.run(pack);
 
         return dst;
     }
diff --git a/tests/validation/fixtures/IndirectConv2dAddressPrecalculationFixture.h b/tests/validation/fixtures/IndirectConv2dAddressPrecalculationFixture.h
new file mode 100644
index 0000000000..7374093f51
--- /dev/null
+++ b/tests/validation/fixtures/IndirectConv2dAddressPrecalculationFixture.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_FIXTURE
+#define ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/Globals.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/IndirectConv2dAddressPrecalculation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::misc::shape_calculator;
+
+template <typename TensorType, typename AccessorType, typename OperatorType>
+class IndirectConv2dAddressPrecalculationValidationFixture : public framework::Fixture
+{
+public:
+    void setup(unsigned int src_w,
+               unsigned int src_h,
+               unsigned int src_b,
+               unsigned int wei_w,
+               unsigned int wei_h,
+               unsigned int pad,
+               unsigned int stride,
+               unsigned int m0)
+    {
+        DirectConvComputeKernelInfo desc;
+        desc.m0                         = m0;
+        desc.n0                         = 1;     // Not used by the kernel
+        desc.k0                         = 1;     // Not used by the kernel
+        desc.export_weights_to_cl_image = false; // Not used by the kernel
+
+        const PadStrideInfo conv_info(stride, stride, pad, pad);
+
+        const TensorShape shape_conv_src(23, // The input channels are not used by the kernel
+                                         src_w,
+                                         src_h,
+                                         src_b);
+
+        const TensorShape shape_conv_wei(23, // The input channels are not used by the kernel
+                                         wei_w,
+                                         wei_h,
+                                         23 // The output channels are not used by the kernel
+                                        );
+
+        // The result of the kernel does not change with the datatype. Hence, we can fix it to Fp16 for validation purposes
+        const DataType data_type = DataType::F16;
+
+        _target    = compute_target(shape_conv_src, shape_conv_wei, data_type, conv_info, desc);
+        _reference = compute_reference(shape_conv_src, shape_conv_wei, data_type, conv_info, desc);
+    }
+
+protected:
+    TensorType compute_target(TensorShape shape_conv_src, TensorShape shape_conv_wei, DataType data_type, const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+    {
+        TensorInfo src_conv_info(shape_conv_src, 1, data_type, DataLayout::NHWC);
+        TensorInfo wei_conv_info(shape_conv_wei, 1, data_type, DataLayout::NHWC);
+        TensorType dst;
+
+        // The output tensor will be auto-initialized within the function
+
+        // Create and configure function
+        OperatorType func;
+        func.configure(&src_conv_info, &wei_conv_info, dst.info(), conv_info, desc);
+
+        add_padding_x({ &dst });
+
+        // Allocate tensors
+        dst.allocator()->allocate();
+
+        // Run the indirect convolution address pre-calculation operator
+        ITensorPack tensors = { { ACL_DST, &dst } };
+        func.run(tensors);
+
+        return dst;
+    }
+
+    SimpleTensor<int32_t> compute_reference(TensorShape shape_conv_src, TensorShape shape_conv_wei, DataType data_type, const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+    {
+        ARM_COMPUTE_UNUSED(data_type);
+        TensorShape shape_out         = compute_indirect_buffer_shape(shape_conv_src, DataLayout::NHWC, shape_conv_wei, conv_info, desc);
+        TensorShape output_conv_shape = compute_deep_convolution_shape(shape_conv_src, DataLayout::NHWC, shape_conv_wei, conv_info);
+
+        return reference::indirect_conv2d_addr_precalculation(shape_conv_src, shape_conv_wei, output_conv_shape, shape_out, conv_info);
+    }
+
+    TensorType            _target{};
+    SimpleTensor<int32_t> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_FIXTURE */
+\ No newline at end of file
diff --git a/tests/validation/fixtures/InstanceNormalizationLayerFixture.h b/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
index 611d9aae5b..c26dd99f02 100644
--- a/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class InstanceNormalizationLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, DataLayout data_layout, bool in_place)
     {
         _target    = compute_target(shape, data_type, data_layout, in_place);
@@ -86,10 +85,10 @@ protected:
         FunctionType instance_norm_func;
         instance_norm_func.configure(&src, in_place ? nullptr : &dst, gamma, beta, epsilon);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
         if(!in_place)
         {
-            ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
         }
 
         // Allocate tensors
@@ -99,10 +98,10 @@ protected:
             dst.allocator()->allocate();
         }
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
         if(!in_place)
         {
-            ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
         }
 
         // Fill tensors
diff --git a/tests/validation/fixtures/L2NormalizeLayerFixture.h b/tests/validation/fixtures/L2NormalizeLayerFixture.h
index 349c0904eb..b8f4b1eaf3 100644
--- a/tests/validation/fixtures/L2NormalizeLayerFixture.h
+++ b/tests/validation/fixtures/L2NormalizeLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class L2NormalizeLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, DataLayout data_layout, int axis, float epsilon)
     {
         _target    = compute_target(shape, data_type, data_layout, axis, epsilon);
@@ -81,15 +80,15 @@ protected:
         FunctionType l2_norm_func;
         l2_norm_func.configure(&src, &dst, axis, epsilon);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/LSTMLayerFixture.h b/tests/validation/fixtures/LSTMLayerFixture.h
index 366d050039..a32e9adfe5 100644
--- a/tests/validation/fixtures/LSTMLayerFixture.h
+++ b/tests/validation/fixtures/LSTMLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class LSTMLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape input_weights_shape, TensorShape recurrent_weights_shape, TensorShape cell_bias_shape, TensorShape output_cell_shape, TensorShape output_shape,
                TensorShape scratch_shape, ActivationLayerInfo info, float cell_threshold, float projection_threshold, DataType data_type, bool projection_opt, bool peephole_opt,
                bool use_layer_norm)
@@ -167,22 +166,22 @@ protected:
                        &scratch, &output_state_out, &cell_state_out, &output,
                        lstm_params, info, cell_threshold, projection_threshold);
 
-        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(input_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(input_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(input_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(recurrent_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(recurrent_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(recurrent_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(forget_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(cell_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output_state_in.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(cell_state_in.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(scratch.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output_state_out.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(cell_state_out.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(input_to_forget_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(input_to_cell_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(input_to_output_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(recurrent_to_forget_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(recurrent_to_cell_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(recurrent_to_output_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(forget_gate_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(cell_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output_gate_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output_state_in.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(cell_state_in.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(scratch.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output_state_out.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(cell_state_out.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
         // Allocate tensors
         input.allocator()->allocate();
@@ -202,22 +201,22 @@ protected:
         cell_state_out.allocator()->allocate();
         output.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!input_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!input_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!input_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!recurrent_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!recurrent_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!recurrent_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!forget_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!cell_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output_state_in.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!cell_state_in.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!scratch.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output_state_out.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!cell_state_out.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!input_to_forget_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!input_to_cell_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!input_to_output_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!recurrent_to_forget_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!recurrent_to_cell_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!recurrent_to_output_w.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!forget_gate_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!cell_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output_gate_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output_state_in.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!cell_state_in.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!scratch.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output_state_out.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!cell_state_out.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(input), 0);
@@ -236,18 +235,18 @@ protected:
 
         if(!cifg_opt)
         {
-            ARM_COMPUTE_EXPECT(input_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(recurrent_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(cell_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(input_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(input_to_input_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(recurrent_to_input_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(cell_to_input_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(input_gate_bias.info()->is_resizable());
             input_to_input_w.allocator()->allocate();
             recurrent_to_input_w.allocator()->allocate();
             cell_to_input_w.allocator()->allocate();
             input_gate_bias.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!input_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!recurrent_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!cell_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!input_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!input_to_input_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!recurrent_to_input_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!cell_to_input_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!input_gate_bias.info()->is_resizable());
             fill(AccessorType(input_to_input_w), 13);
             fill(AccessorType(recurrent_to_input_w), 14);
             if(peephole_opt)
@@ -260,26 +259,26 @@ protected:
 
         if(peephole_opt)
         {
-            ARM_COMPUTE_EXPECT(cell_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(cell_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(cell_to_forget_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(cell_to_output_w.info()->is_resizable());
             cell_to_forget_w.allocator()->allocate();
             cell_to_output_w.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!cell_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!cell_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!cell_to_forget_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!cell_to_output_w.info()->is_resizable());
             fill(AccessorType(cell_to_forget_w), 18);
             fill(AccessorType(cell_to_output_w), 19);
         }
 
         if(projection_opt)
         {
-            ARM_COMPUTE_EXPECT(projection_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(projection_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(projection_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(projection_bias.info()->is_resizable());
 
             projection_w.allocator()->allocate();
             projection_bias.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!projection_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!projection_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!projection_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!projection_bias.info()->is_resizable());
 
             fill(AccessorType(projection_w), 20);
             fill(AccessorType(projection_bias), 21);
@@ -289,25 +288,25 @@ protected:
         {
             if(!cifg_opt)
             {
-                ARM_COMPUTE_EXPECT(input_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+                ARM_COMPUTE_ASSERT(input_layer_norm_w.info()->is_resizable());
 
                 input_layer_norm_w.allocator()->allocate();
 
-                ARM_COMPUTE_EXPECT(!input_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+                ARM_COMPUTE_ASSERT(!input_layer_norm_w.info()->is_resizable());
 
                 fill(AccessorType(input_layer_norm_w), 22);
             }
-            ARM_COMPUTE_EXPECT(forget_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(cell_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(output_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(forget_layer_norm_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(cell_layer_norm_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(output_layer_norm_w.info()->is_resizable());
 
             forget_layer_norm_w.allocator()->allocate();
             cell_layer_norm_w.allocator()->allocate();
             output_layer_norm_w.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!forget_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!cell_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
-            ARM_COMPUTE_EXPECT(!output_layer_norm_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!forget_layer_norm_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!cell_layer_norm_w.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!output_layer_norm_w.info()->is_resizable());
 
             fill(AccessorType(forget_layer_norm_w), 23);
             fill(AccessorType(cell_layer_norm_w), 24);
@@ -458,7 +457,6 @@ protected:
             }
             input_gate = reference::activation_layer(input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
         }
-
         // Compute cell_state
         SimpleTensor<T> fully_connected_cell_state = reference::fully_connected_layer(input, input_to_cell_w, cell_bias, output_cell_shape);
         transposed_weights                         = reference::transpose(recurrent_to_cell_w);
@@ -474,12 +472,13 @@ protected:
             fill(cell_bias, 8);
             cell_state_out = reference::arithmetic_operation(reference::ArithmeticOperation::ADD, cell_state_out, cell_bias, data_type, ConvertPolicy::SATURATE);
         }
-        cell_state_out = reference::activation_layer(cell_state_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        cell_state_out = reference::activation_layer(cell_state_out, info);
         cell_state_out = reference::pixel_wise_multiplication<T, T, T>(cell_state_out, input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN, data_type);
         cell_state_out = reference::arithmetic_operation(reference::ArithmeticOperation::ADD, cell_state_out, pixelwise_mul, data_type, ConvertPolicy::SATURATE);
+
         if(cell_threshold != 0.f)
         {
-            cell_state_out = reference::activation_layer(cell_state_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+            cell_state_out = reference::activation_layer(cell_state_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
         }
 
         // Compute output
@@ -515,7 +514,6 @@ protected:
                 output_state_out = reference::activation_layer(fully_connected_projection, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
             }
         }
-
         std::vector<SimpleTensor<T>> scratch_inputs;
         if(!cifg_opt)
         {
diff --git a/tests/validation/fixtures/LogicalFixture.h b/tests/validation/fixtures/LogicalFixture.h
index 9f64d89d10..60dc963ba7 100644
--- a/tests/validation/fixtures/LogicalFixture.h
+++ b/tests/validation/fixtures/LogicalFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,9 +60,9 @@ protected:
     {
         for(auto t : tensors)
         {
-            ARM_COMPUTE_EXPECT(t->info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(t->info()->is_resizable());
             t->allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!t->info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!t->info()->is_resizable());
         }
     }
 
@@ -79,7 +79,6 @@ class LogicalBinaryOperationValidationFixture : public LogicalOperationValidatio
     using Parent = LogicalOperationValidationFixtureBase<TensorType, AccessorType, FunctionType, T>;
 
 public:
-    template <typename...>
     void setup(TensorShape shape0, TensorShape shape1)
     {
         Parent::_target    = compute_target(shape0, shape1);
@@ -135,7 +134,6 @@ class LogicalNotValidationFixture : public LogicalOperationValidationFixtureBase
     using Parent = LogicalOperationValidationFixtureBase<TensorType, AccessorType, FunctionType, T>;
 
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         Parent::_target    = compute_target(shape, data_type);
diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h
new file mode 100644
index 0000000000..ffd12e56d0
--- /dev/null
+++ b/tests/validation/fixtures/MatMulFixture.h
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "src/core/utils/quantization/AsymmHelpers.h"
+#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/GEMMLowp.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ReshapeLayer.h"
+#include "tests/validation/Validation.h"
+
+#include <limits>
+#include <random>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulGenericValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape         shape_a,
+               TensorShape         shape_b,
+               TensorShape         output_shape,
+               bool                transpose_a,
+               bool                transpose_b,
+               DataType            data_type,
+               ActivationLayerInfo act_info,
+               int                 num_extra_runs,
+               Settings            settings,
+               QuantizationInfo    a_qinfo = QuantizationInfo(),
+               QuantizationInfo    b_qinfo = QuantizationInfo(),
+               QuantizationInfo    o_qinfo = QuantizationInfo())
+    {
+        // Shapes arrive non-transposed; a flagged operand gets its first two dimensions swapped before use.
+        const PermutationVector swap_first_two(1U, 0U);
+        if (transpose_a)
+        {
+            permute(shape_a, swap_first_two);
+        }
+        if (transpose_b)
+        {
+            permute(shape_b, swap_first_two);
+        }
+        _target    = compute_target(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                                    num_extra_runs, settings, a_qinfo, b_qinfo, o_qinfo);
+        _reference = compute_reference(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                                       a_qinfo, b_qinfo, o_qinfo);
+    }
+
+protected:
+    /** Fill a tensor with pseudo-random values chosen according to its data type.
+     *
+     * @param[in,out] tensor Tensor (or accessor) to fill.
+     * @param[in]     i      Forwarded to the assets library; callers in this file pass a seed offset.
+     * @param[in]     lo     Lower bound handed to the floating-point distributions (unused for quantized types).
+     * @param[in]     hi     Upper bound handed to the floating-point distributions (unused for quantized types).
+     */
+    template <typename U>
+    void fill(U &&tensor, int i, float lo = -1.f, float hi = 1.f)
+    {
+        const DataType dt = tensor.data_type();
+        if (dt == DataType::BFLOAT16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<bfloat16> bf16_dist{lo, hi};
+            library->fill(tensor, bf16_dist, i);
+        }
+        else if (dt == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> f16_dist{lo, hi};
+            library->fill(tensor, f16_dist, i);
+        }
+        else if (dt == DataType::F32)
+        {
+            std::uniform_real_distribution<float> f32_dist(lo, hi);
+            library->fill(tensor, f32_dist, i);
+        }
+        else if (dt == DataType::QASYMM8 || dt == DataType::QASYMM8_SIGNED)
+        {
+            library->fill_tensor_uniform(tensor, i);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+        }
+    }
+
+    virtual TensorType compute_target(const TensorShape  &shape_a,
+                                      const TensorShape  &shape_b,
+                                      const TensorShape  &output_shape,
+                                      bool                transpose_a,
+                                      bool                transpose_b,
+                                      DataType            data_type,
+                                      ActivationLayerInfo act_info,
+                                      int                 num_extra_runs,
+                                      const Settings     &settings,
+                                      QuantizationInfo    a_qinfo,
+                                      QuantizationInfo    b_qinfo,
+                                      QuantizationInfo    o_qinfo)
+    {
+        // Runs the function under test and returns its output tensor.
+        // 1. Create tensors and configure the matmul function
+        // 2. Run num_extra_runs times on throw-away data to stress dynamic inputs
+        // 3. Run once more with the seeds compute_reference uses (2 and 3)
+        TensorType a   = create_tensor<TensorType>(shape_a, data_type, 1, a_qinfo);
+        TensorType b   = create_tensor<TensorType>(shape_b, data_type, 1, b_qinfo);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, o_qinfo);
+
+        FunctionType matmul;
+
+        // Configure MatMulInfo class
+        MatMulInfo mm_info;
+        mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b);
+
+        // Ensure values are dynamic
+        a.info()->set_are_values_constant(false);
+        b.info()->set_are_values_constant(false);
+
+        // Configure operator
+        matmul.configure(&a, &b, &dst, mm_info, settings, act_info);
+
+        // Tensors must still be resizable before allocation...
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Extra runs; their output is overwritten by the final run below.
+        for (int i = 0; i < num_extra_runs; i++)
+        {
+            // Stress dynamic tensors by running multiple times.
+            // --------------------------------------------------------
+            // Derive the seed from the iteration index so every extra run sees fresh data
+            // (seeds are 100, 200, ... so they never coincide with the final run's 2 and 3).
+            const int seed_offset = (i + 1) * 100;
+            fill(AccessorType(a), seed_offset);
+            fill(AccessorType(b), seed_offset + 1);
+
+            matmul.run();
+        }
+
+        // Final run for reference comparison
+        // --------------------------------------------------------
+        // Re-fill tensors with the same seeds as the reference run
+        // Compute MatMul operation
+        fill(AccessorType(a), 2);
+        fill(AccessorType(b), 3);
+
+        matmul.run();
+
+        return dst;
+    }
+
+    /** Reference path for non-integral element types: forwards to reference::gemm; o_qinfo is not used. */
+    template <typename TT>
+    typename std::enable_if<!std::is_integral<TT>::value, SimpleTensor<TT>>::type
+    compute_reference_gemm(const SimpleTensor<TT> &a,
+                           const SimpleTensor<TT> &b,
+                           const SimpleTensor<TT> &c,
+                           float                   alpha,
+                           float                   beta,
+                           const QuantizationInfo &o_qinfo)
+    {
+        ARM_COMPUTE_UNUSED(o_qinfo);
+        return reference::gemm(a, b, c, alpha, beta);
+    }
+
+    /** Reference path for integral element types: low-precision gemm, then fixed-point requantization to o_qinfo. */
+    template <typename TT>
+    typename std::enable_if<std::is_integral<TT>::value, SimpleTensor<TT>>::type
+    compute_reference_gemm(const SimpleTensor<TT> &a,
+                           const SimpleTensor<TT> &b,
+                           const SimpleTensor<TT> &c,
+                           float                   alpha,
+                           float                   beta,
+                           const QuantizationInfo &o_qinfo)
+    {
+        ARM_COMPUTE_UNUSED(alpha, beta);
+
+        const auto a_uq            = a.quantization_info().uniform();
+        const auto b_uq            = b.quantization_info().uniform();
+        const auto o_uq            = o_qinfo.uniform();
+        const auto real_multiplier = a_uq.scale * b_uq.scale / o_uq.scale;
+
+        int32_t fixed_multiplier = 0;
+        int32_t fixed_shift      = 0;
+        quantization::calculate_quantized_multiplier(real_multiplier, &fixed_multiplier, &fixed_shift);
+        std::vector<int32_t> multipliers{fixed_multiplier};
+        std::vector<int32_t> shifts{fixed_shift};
+
+        // Both input offsets are negated so the reference stays aligned with the function implementation, which negates them too.
+        const auto acc = reference::gemmlowp_matrix_multiply_core<int32_t>(a, b, c.shape(), -a_uq.offset, -b_uq.offset);
+
+        auto requantized = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TT>(
+            acc, multipliers, shifts, o_uq.offset, std::numeric_limits<int32_t>::lowest(),
+            std::numeric_limits<int32_t>::max());
+        requantized.quantization_info(o_qinfo);
+
+        return requantized;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape  &a_shape,
+                                      const TensorShape  &b_shape,
+                                      const TensorShape  &output_shape,
+                                      bool                transpose_a,
+                                      bool                transpose_b,
+                                      DataType            data_type,
+                                      ActivationLayerInfo act_info,
+                                      QuantizationInfo    a_qinfo,
+                                      QuantizationInfo    b_qinfo,
+                                      QuantizationInfo    o_qinfo)
+    {
+        // Builds the expected output: gemm on data filled with seeds 2 and 3, then activation, then reshape.
+        // Dimensions > 2 are collapsed onto dimension 2 (4D+ looks like 3D): the gemm reference is not extended to 4D+.
+        TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ);
+        TensorShape a_shape_collapsed      = a_shape.collapsed_from(Window::DimZ);
+        TensorShape b_shape_collapsed      = b_shape.collapsed_from(Window::DimZ);
+
+        // Create reference tensors (c is never filled; beta = 0 below disables it)
+        SimpleTensor<T> a{a_shape_collapsed, data_type, 1, a_qinfo};
+        SimpleTensor<T> b{b_shape_collapsed, data_type, 1, b_qinfo};
+        SimpleTensor<T> c{output_shape_collapsed, data_type, 1};
+
+        // Fill reference with the same seeds as the final run of compute_target
+        fill(a, 2);
+        fill(b, 3);
+
+        /* Note: with the usual batch matmul layout A = (B x M x K), B = (B x K x N), transpose_a == true means the
+        incoming A is (B x K x M), i.e. it was pre-transposed before being passed to the fixture. It is transposed
+        again here to (B x M x K) because the reference implementation works with (B x M x K) input.
+        Likewise, transpose_b == true means the incoming B is the pre-transposed (B x N x K). */
+
+        // Define transposed shapes (first two dimensions swapped, higher dimensions kept)
+        TensorShape a_transposed_shape(a.shape());
+        a_transposed_shape.set(0, a.shape().y());
+        a_transposed_shape.set(1, a.shape().x());
+
+        TensorShape b_transposed_shape(b.shape());
+        b_transposed_shape.set(0, b.shape().y());
+        b_transposed_shape.set(1, b.shape().x());
+
+        // Define transposed tensors (created even when the matching flag is false; then they are simply unused)
+        SimpleTensor<T> a_transposed{a_transposed_shape, data_type};
+        SimpleTensor<T> b_transposed{b_transposed_shape, data_type};
+
+        // Undo the pre-transposition of a if necessary
+        if (transpose_a)
+        {
+            a_transposed = reference::permute<T>(a, PermutationVector(1U, 0U));
+        }
+        // Undo the pre-transposition of b if necessary
+        if (transpose_b)
+        {
+            b_transposed = reference::permute<T>(b, PermutationVector(1U, 0U));
+        }
+
+        // Setting beta to 0 will effectively disable C for the
+        // computation of the reference: alpha * A * B + 0 * C
+        // Use the transposed tensor for an operand whose flag is set, otherwise the original tensor
+        auto result = compute_reference_gemm<T>((transpose_a) ? a_transposed : a, (transpose_b) ? b_transposed : b, c,
+                                                1.0f, 0.f, o_qinfo);
+
+        result = reference::activation_layer<T>(result, act_info, o_qinfo);
+
+        // Reshape the gemm output back to output_shape if collapsing changed the shape (high-dimensional tensors)
+        if (output_shape_collapsed != output_shape)
+        {
+            result = reference::reshape_layer(result, output_shape);
+        }
+
+        return result;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+/// TODO: (ONCPUML-1451) The current state of this fixture is interim and a longer-term testing method will be implemented later.
+/// @note: Currently we support only a 2x2 test due to the lack of reorder ref. implementation.
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulFixedFormatFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    TensorType compute_target(const TensorShape  &shape_a,
+                              const TensorShape  &shape_b,
+                              const TensorShape  &output_shape,
+                              bool                transpose_a,
+                              bool                transpose_b,
+                              DataType            data_type,
+                              ActivationLayerInfo act_info,
+                              int                 num_extra_runs,
+                              const Settings     &settings,
+                              QuantizationInfo    a_qinfo,
+                              QuantizationInfo    b_qinfo,
+                              QuantizationInfo    o_qinfo) override
+    {
+        // Runs the function under test on re-packed weights and returns its output tensor.
+        // 1. Create tensors (b plus its re-packed copy weights_transformed) and configure the function
+        // 2. Run num_extra_runs times on throw-away data to stress dynamic inputs
+        // 3. Run once more with the seeds compute_reference uses (2 and 3)
+        TensorType a   = create_tensor<TensorType>(shape_a, data_type, 1, a_qinfo);
+        TensorType b   = create_tensor<TensorType>(shape_b, data_type, 1, b_qinfo);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, o_qinfo);
+
+        const auto       weight_tensor_info  = TensorInfo(*b.info());
+        const TensorInfo new_tensor_info     = prepare_weights(weight_tensor_info);
+        TensorType       weights_transformed = create_tensor<TensorType>(new_tensor_info);
+
+        // Configure MatMulInfo class
+        MatMulInfo mm_info;
+        mm_info.adj_lhs(transpose_a).adj_rhs(transpose_b);
+
+        // Ensure values are dynamic
+        a.info()->set_are_values_constant(false);
+        b.info()->set_are_values_constant(false);
+        weights_transformed.info()->set_are_values_constant(false);
+
+        FunctionType matmul;
+
+        // Configure operator: it reads weights_transformed, not b
+        matmul.configure(&a, &weights_transformed, &dst, mm_info, settings, act_info);
+
+        // Tensors must still be resizable before allocation...
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights_transformed.info()->is_resizable());
+
+        // Allocate tensors
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        dst.allocator()->allocate();
+        weights_transformed.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights_transformed.info()->is_resizable());
+
+        // Extra runs; their output is overwritten by the final run below.
+        for (int i = 0; i < num_extra_runs; i++)
+        {
+            // Stress dynamic tensors by running multiple times.
+            // --------------------------------------------------------
+            // Fill with an iteration-dependent seed (100, 200, ...; never the final run's 2 and 3)
+            // and re-pack b, since the function reads weights_transformed rather than b.
+            const int seed_offset = (i + 1) * 100;
+            this->fill(AccessorType(a), seed_offset);
+            this->fill(AccessorType(b), seed_offset + 1);
+            rearrange_data(AccessorType(b), AccessorType(weights_transformed));
+            matmul.run();
+        }
+
+        // Final run for reference comparison
+        // --------------------------------------------------------
+        // Re-fill tensors with the same seeds as the reference run
+        // Re-pack the weights and compute the MatMul operation
+        this->fill(AccessorType(a), 2);
+        this->fill(AccessorType(b), 3);
+
+        rearrange_data(AccessorType(b), AccessorType(weights_transformed));
+
+        matmul.run();
+
+        return dst;
+    }
+
+    void setup(TensorShape         shape_a,
+               TensorShape         shape_b,
+               TensorShape         output_shape,
+               bool                transpose_a,
+               bool                transpose_b,
+               DataType            data_type,
+               ActivationLayerInfo act_info,
+               int                 num_extra_runs,
+               Settings            settings,
+               QuantizationInfo    a_qinfo,
+               QuantizationInfo    b_qinfo,
+               QuantizationInfo    o_qinfo)
+    {
+        using Generic = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+        if (CPUInfo::get().has_bf16()) // Nothing is computed when the CPU does not report BF16 support.
+        {
+            Generic::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                           num_extra_runs, settings, a_qinfo, b_qinfo, o_qinfo);
+        }
+    }
+
+private:
+    /** Build the TensorInfo used for the transformed weights.
+     *  A copy of @p tensor_info is re-initialised with shape (H, W), the same channel count, data type
+     *  and first-element offset, strides 1 and 2 forced to 32 bytes and a total size of 32 bytes.
+     *  NOTE(review): the 32-byte constants look tied to the weight format under test - confirm. */
+    TensorInfo prepare_weights(const TensorInfo tensor_info)
+    {
+        const DataLayout data_layout = tensor_info.data_layout();
+        ARM_COMPUTE_EXPECT(data_layout == DataLayout::NCHW, framework::LogLevel::ERRORS);
+
+        const TensorShape shape = tensor_info.tensor_shape();
+        const int         H     = shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
+        const int         W     = shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
+        ARM_COMPUTE_EXPECT(H <= 2 && W <= 2, framework::LogLevel::ERRORS);
+
+        arm_compute::Strides strides = tensor_info.strides_in_bytes();
+        strides.set(1, 32);
+        strides.set(2, 32);
+
+        const size_t total_bytes = 32;
+
+        TensorInfo transformed_info = tensor_info;
+        transformed_info.init(TensorShape(H, W), tensor_info.num_channels(), tensor_info.data_type(), strides,
+                              tensor_info.offset_first_element_in_bytes(), total_bytes);
+        return transformed_info;
+    }
+
+    /** Scatter the source weights into the transformed buffer: src[i] is written to dst[{0, 4, 1, 5}[i]].
+     *  O, H, W and I are read from the BATCHES, HEIGHT, WIDTH and CHANNEL dimensions of the source shape.
+     *  The copy is capped at src.num_elements() because the expectations below accept sources smaller
+     *  than 2x2, which must not be read past their end.
+     *  NOTE(review): the index table is written for the 2x2 case; confirm the mapping for smaller shapes. */
+    void rearrange_data(const AccessorType src, AccessorType dst)
+    {
+        const TensorShape shape       = src.shape();
+        const DataLayout  data_layout = src.data_layout();
+        ARM_COMPUTE_EXPECT(data_layout == DataLayout::NCHW, framework::LogLevel::ERRORS);
+        const unsigned int O = shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)];
+        const unsigned int H = shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
+        const unsigned int W = shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
+        const unsigned int I = shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
+        ARM_COMPUTE_EXPECT(H <= 2 && W <= 2, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(I == 1 && O == 1, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(src.num_elements() <= dst.num_elements(), framework::LogLevel::ERRORS);
+
+        const T *src_ptr = reinterpret_cast<const T *>(src.data());
+        T       *dst_ptr = reinterpret_cast<T *>(dst.data());
+        const int dst_idx[]   = {0, 4, 1, 5};
+        const int num_to_copy = (src.num_elements() < 4) ? src.num_elements() : 4;
+        for (int i = 0; i < num_to_copy; i++)
+        {
+            dst_ptr[dst_idx[i]] = src_ptr[i];
+        }
+    }
+};
+
+/** MatMul fixture using a default-constructed activation, zero extra runs and default Settings. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    /** Forward shapes, transpose flags and data type to the generic fixture's setup(). */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape,
+               bool transpose_a, bool transpose_b, DataType data_type)
+    {
+        using GenericFixture = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+
+        GenericFixture::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type,
+                              ActivationLayerInfo(), 0, Settings());
+    }
+};
+
+/** MatMul fixture that takes an activation and a number of extra runs.
+ *  Settings are default-constructed. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationWithDynamicTensorsFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    /** Forward every argument to the generic fixture's setup(), followed by Settings(). */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape,
+               bool transpose_a, bool transpose_b, DataType data_type,
+               ActivationLayerInfo act_info, int num_extra_runs)
+    {
+        using GenericFixture = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+
+        GenericFixture::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                              num_extra_runs, Settings());
+    }
+};
+
+/** MatMul fixture that additionally takes quantization infos for both inputs and the output.
+ *  Settings are default-constructed. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class QuantizedMatMulValidationFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    /** Forward every argument to the generic fixture's setup().
+     *
+     * Settings() is inserted between num_extra_runs and the quantization infos.
+     */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape,
+               bool transpose_a, bool transpose_b, DataType data_type,
+               ActivationLayerInfo act_info, int num_extra_runs,
+               QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+    {
+        using GenericFixture = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+
+        GenericFixture::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                              num_extra_runs, Settings(), a_qinfo, b_qinfo, o_qinfo);
+    }
+};
+
+/** MatMul fixture that takes an activation; it uses zero extra runs and default Settings. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationWithActivationFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    /** Forward every argument to the generic fixture's setup(), followed by 0 and Settings(). */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape,
+               bool transpose_a, bool transpose_b, DataType data_type,
+               ActivationLayerInfo act_info)
+    {
+        using GenericFixture = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+
+        GenericFixture::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                              0, Settings());
+    }
+};
+
+/** MatMul fixture that builds its activation from a function and one value used as both the second
+ *  and third ActivationLayerInfo constructor argument; zero extra runs, default Settings. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class MatMulValidationWithActivationAlphaBetaFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    /** Construct the ActivationLayerInfo and forward everything to the generic fixture's setup(). */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape,
+               bool transpose_a, bool transpose_b, DataType data_type,
+               ActivationLayerInfo::ActivationFunction function, float alpha_beta)
+    {
+        using GenericFixture = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+
+        const ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+        GenericFixture::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                              0, Settings());
+    }
+};
+
+/** Quantized MatMul fixture that builds its activation from a function and a single value.
+ *  Settings are default-constructed. */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Settings, typename T>
+class QuantizedMatMulValidationWithActivationFixture
+    : public MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>
+{
+public:
+    /** Construct the ActivationLayerInfo and forward everything to the generic fixture's setup().
+     *
+     * Settings() is inserted between num_extra_runs and the quantization infos.
+     */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape,
+               bool transpose_a, bool transpose_b, DataType data_type,
+               ActivationLayerInfo::ActivationFunction function, float alpha_beta, int num_extra_runs,
+               QuantizationInfo a_qinfo, QuantizationInfo b_qinfo, QuantizationInfo o_qinfo)
+    {
+        using GenericFixture = MatMulGenericValidationFixture<TensorType, AccessorType, FunctionType, Settings, T>;
+
+        // alpha_beta is used for both the second and the third constructor argument.
+        const ActivationLayerInfo act_info(function, alpha_beta, alpha_beta);
+        GenericFixture::setup(shape_a, shape_b, output_shape, transpose_a, transpose_b, data_type, act_info,
+                              num_extra_runs, Settings(), a_qinfo, b_qinfo, o_qinfo);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE_H
diff --git a/tests/validation/fixtures/MatMulKernelFixture.h b/tests/validation/fixtures/MatMulKernelFixture.h
new file mode 100644
index 0000000000..26072dff65
--- /dev/null
+++ b/tests/validation/fixtures/MatMulKernelFixture.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/Helper.h"
+#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/GEMMLowp.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ReshapeLayer.h"
+#include <cmath>
+#include <random>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::opencl::kernels;
+
+template <typename T, typename KernelType, bool use_mmul = false>
+class MatMulKernelGenericValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool pretranspose_a, bool pretranspose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type,
+               bool enable_bias)
+    {
+        // Computes _target and _reference for one configuration, unless the device lacks a needed extension.
+        // _hash is added to every seed used below. Hash collisions may occur, but this is intentional: it is
+        // a very easy way to make the random values differ for most test configurations, which were all
+        // using the same set of values before.
+        _hash = M0 + N0 + K0 + shape_a[0] + shape_a[1] + shape_b[0] + shape_b[1] + enable_bias + export_rhs_to_cl_image;
+
+        // Flag to create a bias
+        _enable_bias = enable_bias;
+
+        // For brevity, the input shapes are assumed to be not-transposed for both Lhs and Rhs matrices.
+        QuantizationInfo lhs_q_info;
+        QuantizationInfo rhs_q_info;
+        QuantizationInfo dst_q_info;
+
+        if(is_data_type_quantized(data_type))
+        {
+            const int32_t t_max = static_cast<int32_t>(std::numeric_limits<T>::max());
+            const int32_t t_min = static_cast<int32_t>(std::numeric_limits<T>::min());
+
+            std::mt19937                           generator(library->seed() + _hash);
+            std::uniform_real_distribution<float>  distribution_float(-5.0f, 3.0f);
+            std::uniform_int_distribution<int32_t> distribution_t(t_min, t_max);
+
+            const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+            const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+            const int32_t offset_lhs = distribution_t(generator);
+            const int32_t offset_rhs = distribution_t(generator);
+
+            lhs_q_info = QuantizationInfo(scale_lhs, offset_lhs);
+            rhs_q_info = QuantizationInfo(scale_rhs, offset_rhs);
+
+            const int m = shape_a.y(); // m, n and k are read before the optional permutes further down
+            const int n = shape_b.x();
+            const int k = shape_a.x();
+
+            const float bias_fraction = enable_bias ? 0.5f : 0.f;
+
+            QuantizationHint q_hint = suggest_matmul_dst_q_info_and_bias(lhs_q_info, rhs_q_info, m, n, k, data_type, bias_fraction);
+            dst_q_info              = q_hint.q_info;
+            _min_bias               = q_hint.bias_min;
+            _max_bias               = q_hint.bias_max;
+        }
+
+        if(pretranspose_a)
+        {
+            permute(shape_a, PermutationVector(1U, 0U));
+        }
+
+        if(pretranspose_b)
+        {
+            permute(shape_b, PermutationVector(1U, 0U));
+        }
+
+        // Skip configurations unsupported by the device.
+        _device_supports_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device());
+        if(!_device_supports_export_to_cl_image && export_rhs_to_cl_image)
+        {
+            ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+            framework::ARM_COMPUTE_PRINT_INFO();
+            return; // Note: Also need to skip the validate in corresponding FIXTURE_DATA_TEST_CASEs.
+        }
+
+        _device_supports_mmul = arm_matrix_multiply_supported(CLKernelLibrary::get().get_device());
+        if(!_device_supports_mmul && use_mmul)
+        {
+            ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+            framework::ARM_COMPUTE_PRINT_INFO();
+            return; // Note: Also need to skip the validate in corresponding FIXTURE_DATA_TEST_CASEs.
+        }
+
+        _target    = compute_target(shape_a, shape_b, output_shape, pretranspose_a, pretranspose_b, M0, N0, K0, export_rhs_to_cl_image, data_type, lhs_q_info, rhs_q_info, dst_q_info);
+        _reference = compute_reference(shape_a, shape_b, output_shape, pretranspose_a, pretranspose_b, data_type, lhs_q_info, rhs_q_info, dst_q_info);
+    }
+
+protected:
+    /** Fill @p tensor, passing @p i through to the library fill call. F16 and F32 tensors use a uniform
+     *  real distribution built from @p lo and @p hi; other data types use library->fill_tensor_uniform(). */
+    template <typename U>
+    void fill(U &&tensor, int i, float lo = -1.f, float hi = 1.f)
+    {
+        const DataType tensor_type = tensor.data_type();
+        if(tensor_type == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> half_distribution{ lo, hi };
+            library->fill(tensor, half_distribution, i);
+        }
+        else if(tensor_type == DataType::F32)
+        {
+            std::uniform_real_distribution<float> float_distribution(lo, hi);
+            library->fill(tensor, float_distribution, i);
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    template <typename U> // Fills tensor with int32 values from a uniform distribution built from min and max
+    void fill_bias_s32(U &&tensor, int i, int32_t min, int32_t max)
+    {
+        std::uniform_int_distribution<int32_t> bias_distribution{ min, max };
+        library->fill(tensor, bias_distribution, i);
+    }
+
+    template <typename U, typename D> // Sets the tensor contents via library->fill_tensor_value(tensor, value)
+    void fill_constant(U &&tensor, D value)
+    {
+        library->fill_tensor_value(tensor, value);
+    }
+
+    /** Configure and run KernelType on freshly filled CL tensors and return the destination tensor. */
+    CLTensor compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, bool pretranspose_a, bool pretranspose_b, const int M0, const int N0, const int K0,
+                            bool export_rhs_to_cl_image, DataType data_type, const QuantizationInfo &lhs_q_info, const QuantizationInfo &rhs_q_info, const QuantizationInfo &dst_q_info)
+    {
+        CLSynthetizeOperator<KernelType> matMul{};
+        MatMulKernelInfo                 matmul_info;
+        matmul_info.adj_lhs                = pretranspose_a;
+        matmul_info.adj_rhs                = pretranspose_b;
+        matmul_info.m0                     = M0;
+        matmul_info.n0                     = N0;
+        matmul_info.k0                     = K0;
+        matmul_info.export_rhs_to_cl_image = export_rhs_to_cl_image;
+
+        bool is_quantized = is_data_type_quantized(data_type);
+
+        // Create tensors; the bias is built from output_shape[0] only and is S32 for quantized data types
+        CLTensor a    = create_tensor<CLTensor>(shape_a, data_type, 1, lhs_q_info);
+        CLTensor b    = create_tensor<CLTensor>(shape_b, data_type, 1, rhs_q_info);
+        CLTensor bias = create_tensor<CLTensor>(output_shape[0], (is_quantized) ? DataType::S32 : data_type, 1, dst_q_info);
+        CLTensor dst  = create_tensor<CLTensor>(output_shape, data_type, 1, dst_q_info);
+
+        matMul.configure(a.info(), b.info(), (_enable_bias) ? bias.info() : nullptr, dst.info(), matmul_info);
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors (the bias is allocated further down, only when it is used)
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        fill(CLAccessor(a), _hash + 1);
+        fill(CLAccessor(b), _hash + 2);
+
+        // Pack the tensors the kernel runs on
+        ITensorPack tensors_pack({ { ACL_SRC_0, &a },
+            { ACL_SRC_1, &b },
+            { ACL_DST, &dst }
+        });
+
+        if(_enable_bias)
+        {
+            // Allocate, fill and add bias to TensorPack obj
+            bias.allocator()->allocate();
+            if(is_quantized)
+            {
+                fill_bias_s32(CLAccessor(bias), _hash + 3, _min_bias, _max_bias);
+            }
+            else
+            {
+                fill(CLAccessor(bias), _hash + 3);
+            }
+            tensors_pack.add_tensor(ACL_SRC_2, &bias);
+        }
+
+        matMul.run(tensors_pack);
+        return dst;
+    }
+
+    /** Build the expected output on SimpleTensors filled with the seeds _hash + 1 and _hash + 2. */
+    SimpleTensor<T> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, bool pretranspose_a, bool pretranspose_b, DataType data_type,
+                                      const QuantizationInfo &lhs_q_info, const QuantizationInfo &rhs_q_info, const QuantizationInfo &dst_q_info)
+    {
+        // We collapse dimensions > 3 onto dimension 3, i.e. 5D+ tensors will look like 4D
+        // This is necessary unless we choose to extend gemm reference for 5D+ tensors
+        TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ);
+        TensorShape shape_a_collapsed      = shape_a.collapsed_from(Window::DimZ);
+        TensorShape shape_b_collapsed      = shape_b.collapsed_from(Window::DimZ);
+
+        // Create reference
+        SimpleTensor<T> a{ shape_a_collapsed, data_type, 1, lhs_q_info };
+        SimpleTensor<T> b{ shape_b_collapsed, data_type, 1, rhs_q_info };
+        SimpleTensor<T> c{ output_shape_collapsed, data_type, 1, dst_q_info };
+
+        // Fill reference with the same seed offsets that compute_target uses for a and b
+        fill(a, _hash + 1);
+        fill(b, _hash + 2);
+
+        /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
+           therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K)
+           in order to be able to call reference implementation that works with (B x M x K) input.
+           Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), B must be pre-transposed before passing it to the fixture. */
+
+        // Define transposed shapes (dimensions 0 and 1 swapped)
+        TensorShape a_transposed_shape(a.shape());
+        a_transposed_shape.set(0, a.shape().y());
+        a_transposed_shape.set(1, a.shape().x());
+
+        TensorShape b_transposed_shape(b.shape());
+        b_transposed_shape.set(0, b.shape().y());
+        b_transposed_shape.set(1, b.shape().x());
+
+        // Define transposed tensors (only overwritten, and only used, when the matching flag is set)
+        SimpleTensor<T> a_transposed{ a_transposed_shape, data_type };
+        SimpleTensor<T> b_transposed{ b_transposed_shape, data_type };
+
+        // pretranspose a if necessary
+        if(pretranspose_a)
+        {
+            a_transposed = reference::permute<T>(a, PermutationVector(1U, 0U));
+        }
+
+        // pretranspose b if necessary
+        if(pretranspose_b)
+        {
+            b_transposed = reference::permute<T>(b, PermutationVector(1U, 0U));
+        }
+
+        // Use transposed tensors if boolean enabled else use original tensors
+        SimpleTensor<T> result = gemm_reference<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c);
+
+        // We reshape the gemm output back if the tensor is high dimensional
+        if(output_shape_collapsed != output_shape)
+        {
+            result = reference::reshape_layer(result, output_shape);
+        }
+
+        return result;
+    }
+
+    /** Float/half reference: 1 * A * B + beta * C, where beta is 1 with bias and 0 (C ignored) without. */
+    template <typename U = T>
+    typename std::enable_if < std::is_same<U, float>::value || std::is_same<U, half>::value, SimpleTensor<U >>::type gemm_reference(SimpleTensor<U> &a, SimpleTensor<U> &b, SimpleTensor<U> &c)
+    {
+        // Fill bias, then copy first dimension into subsequent dimensions to mimic broadcast
+        // of bias tensor from shape [dst.dimension(0)] to [dst.tensor_shape()] in target kernel
+        if(_enable_bias)
+        {
+            fill(c, _hash + 3);
+            const int n          = c.shape().x();
+            const int other_dims = c.shape().collapsed_from(1)[1];
+            for(int i = 1; i < other_dims; ++i) // For all data, copy first n elements into remaining batches
+            {
+                memcpy(c.data() + i * n, c.data(), n * sizeof(U)); // c stores U elements, so size by U, not T
+            }
+        }
+        // beta == 0 removes C from the reference computation
+        return reference::gemm<U>(a, b, c, 1.0f, (_enable_bias) ? 1.0f : 0.f);
+    }
+
+    /** Quantized reference: int32 product from gemmlowp_matrix_multiply_core(), then gemmlowp_quantize_down_scale_by_fixedpoint(). */
+    template <typename U = T>
+    typename std::enable_if < std::is_same<U, int8_t>::value || std::is_same<U, uint8_t>::value, SimpleTensor<U >>::type gemm_reference(SimpleTensor<U> &a, SimpleTensor<U> &b, SimpleTensor<U> &c)
+    {
+        const UniformQuantizationInfo aq = a.quantization_info().uniform();
+        const UniformQuantizationInfo bq = b.quantization_info().uniform();
+        const UniformQuantizationInfo cq = c.quantization_info().uniform();
+
+        const SimpleTensor<int32_t> result = reference::gemmlowp_matrix_multiply_core<int32_t, U, U>(a, b, c.shape(), -aq.offset, -bq.offset);
+
+        std::vector<int32_t> gemmlowp_multipliers{ 1 }; // one-element vectors: element 0 is written just below
+        std::vector<int32_t> gemmlowp_shifts{ 1 };
+        const int            gemmlowp_offset = cq.offset;
+        const float          scale           = aq.scale * bq.scale / cq.scale;
+
+        quantization::calculate_quantized_multiplier(scale, &gemmlowp_multipliers[0], &gemmlowp_shifts[0]);
+        constexpr int32_t gemmlowp_min_bound = std::numeric_limits<int32_t>::min(); // the full int32 range is passed as bounds
+        constexpr int32_t gemmlowp_max_bound = std::numeric_limits<int32_t>::max();
+
+        SimpleTensor<int> bias{ c.shape(), DataType::S32 };
+        if(_enable_bias)
+        {
+            // Identical to float implementation, fill and copy values of bias first dimension
+            fill_bias_s32(bias, _hash + 3, _min_bias, _max_bias);
+            const int          n          = bias.shape().x();
+            const int          other_dims = bias.shape().collapsed_from(1)[1];
+            const unsigned int dt_size    = sizeof(int32_t);
+            for(int i = 1; i < other_dims; ++i)
+            {
+                memcpy(bias.data() + i * n, bias.data(), n * dt_size);
+            }
+        }
+        else
+        {
+            fill_constant(bias, static_cast<int32_t>(0)); // effectively disable bias
+        }
+
+        const SimpleTensor<U> final_result = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, U>(result, bias,
+                                                                                                               gemmlowp_multipliers, gemmlowp_shifts, gemmlowp_offset, gemmlowp_min_bound, gemmlowp_max_bound);
+        return final_result;
+    }
+
+    CLTensor        _target{};                                   // Result of compute_target(), assigned in setup()
+    SimpleTensor<T> _reference{};                                // Result of compute_reference(), assigned in setup()
+    bool            _enable_bias{ false };                       // Copy of setup()'s enable_bias argument
+    bool            _device_supports_export_to_cl_image{ true }; // Set from image2d_from_buffer_supported() in setup()
+    bool            _device_supports_mmul{ true };               // Set from arm_matrix_multiply_supported() in setup()
+    int32_t         _min_bias{ 0 };                              // Lower bias bound; only written for quantized data types
+    int32_t         _max_bias{ 0 };                              // Upper bias bound; only written for quantized data types
+    int32_t         _hash{ 0 };                                  // Per-configuration value added to every fill seed
+};
+};
+
+/** Kernel-level MatMul fixture that always runs the generic fixture with bias disabled. */
+template <typename T, typename KernelType, bool use_mmul = false>
+class MatMulKernelValidationFixture : public MatMulKernelGenericValidationFixture<T, KernelType, use_mmul>
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool pretranspose_a, bool pretranspose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+    {
+        MatMulKernelGenericValidationFixture<T, KernelType, use_mmul>::setup(shape_a, shape_b, output_shape, pretranspose_a, pretranspose_b, M0, N0, K0, export_rhs_to_cl_image, data_type, /* enable_bias */ false);
+    }
+};
+
+/** Kernel-level MatMul fixture that always runs the generic fixture with bias enabled. */
+template <typename T, typename KernelType, bool use_mmul = false>
+class MatMulKernelWithBiasValidation : public MatMulKernelGenericValidationFixture<T, KernelType, use_mmul>
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape output_shape, bool pretranspose_a, bool pretranspose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+    {
+        MatMulKernelGenericValidationFixture<T, KernelType, use_mmul>::setup(shape_a, shape_b, output_shape, pretranspose_a, pretranspose_b, M0, N0, K0, export_rhs_to_cl_image, data_type, /* enable_bias */ true);
+    }
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE_H
diff --git a/tests/validation/fixtures/MaxUnpoolingLayerFixture.h b/tests/validation/fixtures/MaxUnpoolingLayerFixture.h
index 7c118da319..808e3ffabd 100644
--- a/tests/validation/fixtures/MaxUnpoolingLayerFixture.h
+++ b/tests/validation/fixtures/MaxUnpoolingLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename PoolingFunctionTy
 class MaxUnpoolingLayerValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout)
     {
         std::mt19937                    gen(library->seed());
@@ -106,9 +105,9 @@ protected:
         MaxUnpoolingFunctionType unpool_layer;
         unpool_layer.configure(&dst, &indices, &unpooled, pool_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(indices.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -116,10 +115,10 @@ protected:
         indices.allocator()->allocate();
         unpooled.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!indices.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!unpooled.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!indices.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!unpooled.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -150,7 +149,6 @@ template <typename TensorType, typename AccessorType, typename F1, typename F2,
 class MaxUnpoolingLayerValidationFixture : public MaxUnpoolingLayerValidationGenericFixture<TensorType, AccessorType, F1, F2, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, DataType data_type, DataLayout data_layout)
     {
         MaxUnpoolingLayerValidationGenericFixture<TensorType, AccessorType, F1, F2, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, true),
diff --git a/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h b/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
index 5d11d1f8e2..bf5d20790c 100644
--- a/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,30 +44,35 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class MeanStdDevNormalizationLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape, DataType dt, bool in_place, float epsilon = 1e-8f)
+    void setup(TensorShape shape, DataType dt, bool in_place, float epsilon = 1e-8)
     {
-        _data_type = dt;
-        _target    = compute_target(shape, dt, in_place, epsilon);
-        _reference = compute_reference(shape, dt, epsilon);
+        QuantizationInfo qi = QuantizationInfo(0.5f, 10);
+        _data_type          = dt;
+        _target             = compute_target(shape, dt, in_place, epsilon, qi);
+        _reference          = compute_reference(shape, dt, epsilon, qi);
     }
 
 protected:
     template <typename U>
-    void fill(U &&src_tensor)
+    void fill(U &&tensor)
     {
-        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
-        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
-        DistributionType distribution{ T(-1.0f), T(1.0f) };
-        library->fill(src_tensor, distribution, 0);
+        if(is_data_type_float(_data_type))
+        {
+            std::uniform_real_distribution<> distribution{ -1.0f, 1.0f };
+            library->fill(tensor, distribution, 0);
+        }
+        else
+        {
+            std::uniform_int_distribution<> distribution{ 0, 255 };
+            library->fill(tensor, distribution, 0);
+        }
     }
 
-    TensorType compute_target(TensorShape shape, DataType dt, bool in_place, float epsilon)
+    TensorType compute_target(TensorShape shape, DataType dt, bool in_place, float epsilon, QuantizationInfo qi)
     {
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, dt, 1);
-        TensorType dst;
+        TensorType src = create_tensor<TensorType>(shape, dt, 1, qi);
+        TensorType dst = create_tensor<TensorType>(shape, dt, 1, qi);
 
         TensorType *dst_ptr = in_place ? &src : &dst;
 
@@ -75,17 +80,17 @@ protected:
         FunctionType norm;
         norm.configure(&src, dst_ptr, epsilon);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
 
         if(!in_place)
         {
             dst.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
         }
 
         // Fill tensors
@@ -104,10 +109,10 @@ protected:
         }
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType dt, float epsilon)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType dt, float epsilon, QuantizationInfo qi)
     {
         // Create reference
-        SimpleTensor<T> ref_src{ shape, dt, 1 };
+        SimpleTensor<T> ref_src{ shape, dt, 1, qi };
 
         // Fill reference
         fill(ref_src);
@@ -119,6 +124,7 @@ protected:
     SimpleTensor<T> _reference{};
     DataType        _data_type{};
 };
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/NonMaxSuppressionFixture.h b/tests/validation/fixtures/NonMaxSuppressionFixture.h
index 6d5fc437ce..043b4731aa 100644
--- a/tests/validation/fixtures/NonMaxSuppressionFixture.h
+++ b/tests/validation/fixtures/NonMaxSuppressionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType>
 class NMSValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, unsigned int max_output_size, float score_threshold, float nms_threshold)
     {
         ARM_COMPUTE_ERROR_ON(max_output_size == 0);
@@ -77,18 +76,18 @@ protected:
         FunctionType nms_func;
         nms_func.configure(&bboxes, &scores, &indices, max_output_size, score_threshold, nms_threshold);
 
-        ARM_COMPUTE_EXPECT(bboxes.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(indices.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(scores.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(bboxes.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(indices.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(scores.info()->is_resizable());
 
         // Allocate tensors
         bboxes.allocator()->allocate();
         indices.allocator()->allocate();
         scores.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!bboxes.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!indices.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!scores.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!bboxes.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!indices.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!scores.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(bboxes), 0, 0.f, 1.f);
diff --git a/tests/validation/fixtures/NormalizationLayerFixture.h b/tests/validation/fixtures/NormalizationLayerFixture.h
index 54570de64f..ddaa3533f5 100644
--- a/tests/validation/fixtures/NormalizationLayerFixture.h
+++ b/tests/validation/fixtures/NormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class NormalizationValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, NormType norm_type, int norm_size, float beta, bool is_scaled, DataType data_type, DataLayout data_layout)
     {
         NormalizationLayerInfo info(norm_type, norm_size, 5, beta, 1.f, is_scaled);
@@ -81,15 +80,15 @@ protected:
         FunctionType norm_layer;
         norm_layer.configure(&src, &dst, info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -119,7 +118,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class NormalizationValidationFixture : public NormalizationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, NormType norm_type, int norm_size, float beta, bool is_scaled, DataType data_type, DataLayout data_layout)
     {
         NormalizationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, norm_type, norm_size, beta, is_scaled, data_type, data_layout);
diff --git a/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h b/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
index 3249ccc4bb..5f2c865950 100644
--- a/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
+++ b/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class NormalizePlanarYUVLayerValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape0, TensorShape shape1, DataType dt, DataLayout data_layout, QuantizationInfo quantization_info)
     {
         _data_type = dt;
@@ -97,10 +96,10 @@ protected:
         FunctionType norm;
         norm.configure(&src, &dst, &mean, &std);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(std.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(std.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -108,10 +107,10 @@ protected:
         mean.allocator()->allocate();
         std.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!mean.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!std.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!mean.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!std.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), AccessorType(mean), AccessorType(std));
@@ -144,7 +143,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class NormalizePlanarYUVLayerValidationFixture : public NormalizePlanarYUVLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape0, TensorShape shape1, DataType dt, DataLayout data_layout)
     {
         NormalizePlanarYUVLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape1, dt, data_layout, QuantizationInfo());
@@ -155,7 +153,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class NormalizePlanarYUVLayerValidationQuantizedFixture : public NormalizePlanarYUVLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape0, TensorShape shape1, DataType dt, DataLayout data_layout, QuantizationInfo quantization_info)
     {
         NormalizePlanarYUVLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape1, dt, data_layout, quantization_info);
diff --git a/tests/validation/fixtures/PadLayerFixture.h b/tests/validation/fixtures/PadLayerFixture.h
index 2279c8b2b3..93b43616ff 100644
--- a/tests/validation/fixtures/PadLayerFixture.h
+++ b/tests/validation/fixtures/PadLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PaddingFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, const PaddingList &padding, const PaddingMode mode)
     {
         PaddingList clamped_padding = padding;
@@ -94,15 +93,15 @@ protected:
         FunctionType padding;
         padding.configure(&src, &dst, paddings, const_value, mode);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/PermuteFixture.h b/tests/validation/fixtures/PermuteFixture.h
index 9bbc0cba5e..b1b3845a8d 100644
--- a/tests/validation/fixtures/PermuteFixture.h
+++ b/tests/validation/fixtures/PermuteFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PermuteValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, PermutationVector perm, DataType data_type)
     {
         _target    = compute_target(input_shape, data_type, perm);
@@ -73,15 +72,15 @@ protected:
         FunctionType perm_func;
         perm_func.configure(&src, &dst, perm);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
index 4eb83859ac..4345d8a13f 100644
--- a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
+++ b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationGenericValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0,
                const TensorShape &shape1,
                DataType            dt_in1,
@@ -76,24 +75,45 @@ protected:
                               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info)
     {
         // Create tensors
-        TensorType src1 = create_tensor<TensorType>(shape0, dt_in1, 1, qinfo0);
-        TensorType src2 = create_tensor<TensorType>(shape1, dt_in2, 1, qinfo1);
-        TensorType dst  = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), dt_out, 1, qinfo_out);
+        const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
+        TensorType        src1      = create_tensor<TensorType>(shape0, dt_in1, 1, qinfo0);
+        TensorType        src2      = create_tensor<TensorType>(shape1, dt_in2, 1, qinfo1);
+        TensorType        dst       = create_tensor<TensorType>(out_shape, dt_out, 1, qinfo_out);
+
+        // For in-place runs, pick as destination an input whose shape, quantization info and data type match the output
+        TensorType *actual_dst = &dst;
+        if(_is_inplace)
+        {
+            bool src1_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out) && (dt_in1 == dt_out);
+            bool src2_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out) && (dt_in2 == dt_out);
+            bool do_in_place     = out_shape.total_size() != 0 && (src1_is_inplace || src2_is_inplace);
+            ARM_COMPUTE_ASSERT(do_in_place);
+
+            if(src1_is_inplace)
+            {
+                actual_dst = &src1;
+            }
+            else
+            {
+                actual_dst = &src2;
+            }
+        }
 
         auto allocate_tensor = [](TensorType & t)
         {
-            ARM_COMPUTE_EXPECT(t.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(t.info()->is_resizable());
             t.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!t.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!t.info()->is_resizable());
         };
 
         // Create and configure function
         FunctionType multiply;
-        multiply.configure(&src1, &src2, (_is_inplace ? &src1 : &dst), scale, convert_policy, rounding_policy, act_info);
+        multiply.configure(&src1, &src2, actual_dst, scale, convert_policy, rounding_policy, act_info);
 
         allocate_tensor(src1);
         allocate_tensor(src2);
 
+        // When the computation is not in-place, the separate dst tensor still has to be allocated
         if(!_is_inplace)
         {
             allocate_tensor(dst);
@@ -106,12 +126,7 @@ protected:
         // Compute function
         multiply.run();
 
-        if(_is_inplace)
-        {
-            return src1;
-        }
-
-        return dst;
+        return std::move(*actual_dst);
     }
 
     SimpleTensor<T3> compute_reference(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out,
@@ -122,16 +137,12 @@ protected:
         SimpleTensor<T1> src1{ shape0, dt_in1, 1, qinfo0 };
         SimpleTensor<T2> src2{ shape1, dt_in2, 1, qinfo1 };
 
-        // current in-place implementation only supports same metadata of input and output tensors.
-        // By ignoring output quantization information here, we can make test cases implementation much simpler.
-        QuantizationInfo output_qinfo = _is_inplace ? qinfo0 : qinfo_out;
-
         // Fill reference
         fill(src1, 0);
         fill(src2, 1);
 
-        auto result = reference::pixel_wise_multiplication<T1, T2, T3>(src1, src2, scale, convert_policy, rounding_policy, dt_out, output_qinfo);
-        return act_info.enabled() ? reference::activation_layer(result, act_info, output_qinfo) : result;
+        auto result = reference::pixel_wise_multiplication<T1, T2, T3>(src1, src2, scale, convert_policy, rounding_policy, dt_out, qinfo_out);
+        return act_info.enabled() ? reference::activation_layer(result, act_info, qinfo_out) : result;
     }
 
     TensorType       _target{};
@@ -143,7 +154,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, bool is_inplace)
     {
         PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape, shape, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
@@ -155,7 +165,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationBroadcastValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
                bool is_inplace)
     {
@@ -168,7 +177,17 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationValidationFloatFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
 {
 public:
-    template <typename...>
+    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, ActivationLayerInfo act_info, bool is_inplace)
+    {
+        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, shape, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
+                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
+class PixelWiseMultiplicationValidationIntegerFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
+{
+public:
     void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, ActivationLayerInfo act_info, bool is_inplace)
     {
         PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, shape, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
@@ -180,7 +199,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationBroadcastValidationFloatFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
                ActivationLayerInfo act_info, bool is_inplace)
     {
@@ -193,7 +211,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationValidationQuantizedFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
                QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
     {
@@ -206,7 +223,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PixelWiseMultiplicationBroadcastValidationQuantizedFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
-    template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
                QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
     {
diff --git a/tests/validation/fixtures/Pooling3dLayerFixture.h b/tests/validation/fixtures/Pooling3dLayerFixture.h
new file mode 100644
index 0000000000..1bdf615fb1
--- /dev/null
+++ b/tests/validation/fixtures/Pooling3dLayerFixture.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_POOLING_3D_LAYER_FIXTURE
+#define ARM_COMPUTE_TEST_POOLING_3D_LAYER_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/Pooling3dLayer.h"
+#include <random>
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Pooling3dLayerValidationGenericFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape, Pooling3dLayerInfo pool_info, DataType data_type, QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo())
+    {
+        _target    = compute_target(shape, pool_info, data_type, input_qinfo, output_qinfo);
+        _reference = compute_reference(shape, pool_info, data_type, input_qinfo, output_qinfo);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        if(tensor.data_type() == DataType::F32)
+        {
+            std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, 0);
+        }
+        else if(tensor.data_type() == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+            library->fill(tensor, distribution, 0);
+        }
+        else // data type is quantized_asymmetric
+        {
+            library->fill_tensor_uniform(tensor, 0);
+        }
+    }
+
+    TensorType compute_target(TensorShape shape, Pooling3dLayerInfo info,
+                              DataType data_type, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+    {
+        // Create tensors
+        TensorType        src       = create_tensor<TensorType>(shape, data_type, 1, input_qinfo, DataLayout::NDHWC);
+        const TensorShape dst_shape = misc::shape_calculator::compute_pool3d_shape((src.info()->tensor_shape()), info);
+        TensorType        dst       = create_tensor<TensorType>(dst_shape, data_type, 1, output_qinfo, DataLayout::NDHWC);
+
+        // Create and configure function
+        FunctionType pool_layer;
+        pool_layer.validate(src.info(), dst.info(), info);
+        pool_layer.configure(&src, &dst, info);
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        // Compute function
+        pool_layer.run();
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(TensorShape shape, Pooling3dLayerInfo info, DataType data_type, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+    {
+        // Create reference
+        SimpleTensor<T> src(shape, data_type, 1, input_qinfo, DataLayout::NDHWC);
+        // Fill reference
+        fill(src);
+        return reference::pooling_3d_layer<T>(src, info, output_qinfo);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Pooling3dLayerValidationFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape shape, PoolingType pool_type, Size3D pool_size, Size3D stride, Padding3D padding, bool exclude_padding, DataType data_type)
+    {
+        Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, Pooling3dLayerInfo(pool_type, pool_size, stride, padding, exclude_padding),
+                                                                                                 data_type);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Pooling3dLayerValidationQuantizedFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape shape, PoolingType pool_type, Size3D pool_size, Size3D stride, Padding3D padding, bool exclude_padding, DataType data_type,
+               QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo())
+    {
+        Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, Pooling3dLayerInfo(pool_type, pool_size, stride, padding, exclude_padding),
+                                                                                                 data_type, input_qinfo, output_qinfo);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Pooling3dLayerGlobalValidationFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape shape, PoolingType pool_type, DataType data_type)
+    {
+        Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, Pooling3dLayerInfo(pool_type), data_type);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class SpecialPooling3dLayerValidationFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape src_shape, Pooling3dLayerInfo pool_info, DataType data_type)
+    {
+        Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_POOLING_3D_LAYER_FIXTURE */
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index af078d4ce3..59c920868b 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,16 +45,31 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PoolingLayerValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout, bool indices = false,
-               QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo())
+               QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo(), bool mixed_layout = false)
     {
-        _pool_info = pool_info;
-        _target    = compute_target(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo, indices);
-        _reference = compute_reference(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo, indices);
+        _mixed_layout = mixed_layout;
+        _pool_info    = pool_info;
+        _target       = compute_target(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo, indices);
+        _reference    = compute_reference(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo, indices);
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst) // Runs layer with the src/dst data layouts flipped, then puts them back.
+    {
+        const DataLayout data_layout = src.info()->data_layout(); // Layout read from src; it is the value written back to both tensors below.
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Run the pooling function while the swapped layouts are in place
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout);
+        dst.info()->set_data_layout(data_layout);
+    }
+
     template <typename U>
     void fill(U &&tensor)
     {
@@ -94,25 +109,33 @@ protected:
         FunctionType pool_layer;
         pool_layer.configure(&src, &dst, info, (indices) ? &_target_indices : nullptr);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(_target_indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(_target_indices.info()->is_resizable());
+
+        add_padding_x({ &src, &dst, &_target_indices }, data_layout);
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
         _target_indices.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!_target_indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!_target_indices.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
 
-        // Compute function
-        pool_layer.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(pool_layer, src, dst);
+        }
+        else
+        {
+            // Compute function
+            pool_layer.run();
+        }
         return dst;
     }
 
@@ -129,6 +152,7 @@ protected:
     TensorType             _target{};
     SimpleTensor<T>        _reference{};
     PoolingLayerInfo       _pool_info{};
+    bool                   _mixed_layout{ false };
     TensorType             _target_indices{};
     SimpleTensor<uint32_t> _ref_indices{};
 };
@@ -136,23 +160,22 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PoolingLayerIndicesValidationFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout)
+    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout, bool use_kernel_indices)
     {
-        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, exclude_padding),
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, exclude_padding, false,
+                                                                                                                       true, use_kernel_indices),
                                                                                                data_type, data_layout, true);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false> // mixed_layout is forwarded to the generic fixture's setup.
 class PoolingLayerValidationFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout)
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, exclude_padding),
-                                                                                               data_type, data_layout);
+                                                                                               data_type, data_layout, false, QuantizationInfo(), QuantizationInfo(), mixed_layout); // indices = false; both qinfos are spelled out so mixed_layout binds to the final bool parameter instead of input_qinfo.
     }
 };
 
@@ -160,7 +183,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PoolingLayerValidationMixedPrecisionFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout, bool fp_mixed_precision = false)
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, exclude_padding, fp_mixed_precision),
@@ -168,16 +190,15 @@ public:
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false> // mixed_layout is forwarded to the generic fixture's setup.
 class PoolingLayerValidationQuantizedFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout = DataLayout::NCHW,
                QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo())
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, exclude_padding),
-                                                                                               data_type, data_layout, false, input_qinfo, output_qinfo);
+                                                                                               data_type, data_layout, false, input_qinfo, output_qinfo, mixed_layout); // indices = false; the caller's qinfos are passed through and mixed_layout fills the final bool parameter.
     }
 };
 
@@ -185,10 +206,9 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SpecialPoolingLayerValidationFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape src_shape, PoolingLayerInfo pool_info, DataType data_type)
     {
-        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type, DataLayout::NCHW);
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type, pool_info.data_layout);
     }
 };
 
@@ -196,7 +216,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class GlobalPoolingLayerValidationFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, DataType data_type, DataLayout data_layout = DataLayout::NCHW)
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, data_layout), data_type, data_layout);
diff --git a/tests/validation/fixtures/PriorBoxLayerFixture.h b/tests/validation/fixtures/PriorBoxLayerFixture.h
index ef18c0d787..0a76cfd155 100644
--- a/tests/validation/fixtures/PriorBoxLayerFixture.h
+++ b/tests/validation/fixtures/PriorBoxLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class PriorBoxLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, PriorBoxLayerInfo info, DataType data_type, DataLayout data_layout)
     {
         TensorInfo        input_info(input_shape, 1, data_type);
@@ -73,18 +72,18 @@ protected:
         FunctionType prior_box;
         prior_box.configure(&src1, &src2, &dst, info);
 
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src1.allocator()->allocate();
         src2.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src1.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!src2.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Compute function
         prior_box.run();
diff --git a/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h b/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
index 0cf2ef04f7..e864b4affe 100644
--- a/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
+++ b/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class QLSTMLayerNormalizationValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weight_shape, TensorShape bias_shape, DataType data_type, QuantizationInfo weight_qinfo)
     {
         ARM_COMPUTE_ERROR_ON(data_type != DataType::QSYMM16);
@@ -91,9 +90,9 @@ protected:
     {
         for(auto t : tensors)
         {
-            ARM_COMPUTE_EXPECT(t->info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(t->info()->is_resizable());
             t->allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!t->info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!t->info()->is_resizable());
         }
     }
 
diff --git a/tests/validation/fixtures/QuantizationLayerFixture.h b/tests/validation/fixtures/QuantizationLayerFixture.h
index 4f46f99ff5..1b21967bda 100644
--- a/tests/validation/fixtures/QuantizationLayerFixture.h
+++ b/tests/validation/fixtures/QuantizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class QuantizationValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type_in, DataType data_type_out, QuantizationInfo qinfo, QuantizationInfo qinfo_in)
     {
         _target    = compute_target(shape, data_type_in, data_type_out, qinfo, qinfo_in);
@@ -70,15 +69,15 @@ protected:
         FunctionType quantization_layer;
         quantization_layer.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -108,7 +107,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class QuantizationValidationFixture : public QuantizationValidationGenericFixture<TensorType, AccessorType, FunctionType, Tin, Tout>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type_in, DataType data_type_out, QuantizationInfo qinfo)
     {
         QuantizationValidationGenericFixture<TensorType, AccessorType, FunctionType, Tin, Tout>::setup(shape, data_type_in, data_type_out, qinfo, QuantizationInfo());
diff --git a/tests/validation/fixtures/RNNLayerFixture.h b/tests/validation/fixtures/RNNLayerFixture.h
index 394d91cd6f..e9a05e7838 100644
--- a/tests/validation/fixtures/RNNLayerFixture.h
+++ b/tests/validation/fixtures/RNNLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class RNNLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape recurrent_weights_shape, TensorShape bias_shape, TensorShape output_shape, ActivationLayerInfo info,
                DataType data_type)
     {
@@ -76,12 +75,12 @@ protected:
         FunctionType rnn;
         rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, info);
 
-        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(recurrent_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(hidden_state.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(recurrent_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(hidden_state.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
         // Allocate tensors
         input.allocator()->allocate();
@@ -91,12 +90,12 @@ protected:
         hidden_state.allocator()->allocate();
         output.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!recurrent_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!hidden_state.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!recurrent_weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!hidden_state.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(input), 0);
diff --git a/tests/validation/fixtures/ROIAlignLayerFixture.h b/tests/validation/fixtures/ROIAlignLayerFixture.h
index c631c24cff..ad76dcbbd9 100644
--- a/tests/validation/fixtures/ROIAlignLayerFixture.h
+++ b/tests/validation/fixtures/ROIAlignLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ROIAlignLayerGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, const ROIPoolingLayerInfo pool_info, TensorShape rois_shape, DataType data_type, DataLayout data_layout, QuantizationInfo qinfo, QuantizationInfo output_qinfo)
     {
         _rois_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::QASYMM16 : data_type;
@@ -138,18 +137,18 @@ protected:
         FunctionType roi_align_layer;
         roi_align_layer.configure(&src, &rois_tensor, &dst, pool_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(rois_tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rois_tensor.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         rois_tensor.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!rois_tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rois_tensor.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -189,7 +188,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ROIAlignLayerFixture : public ROIAlignLayerGenericFixture<TensorType, AccessorType, FunctionType, T, TRois>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, const ROIPoolingLayerInfo pool_info, TensorShape rois_shape, DataType data_type, DataLayout data_layout)
     {
         ROIAlignLayerGenericFixture<TensorType, AccessorType, FunctionType, T, TRois>::setup(input_shape, pool_info, rois_shape, data_type, data_layout,
@@ -201,7 +199,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ROIAlignLayerQuantizedFixture : public ROIAlignLayerGenericFixture<TensorType, AccessorType, FunctionType, T, TRois>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, const ROIPoolingLayerInfo pool_info, TensorShape rois_shape, DataType data_type,
                DataLayout data_layout, QuantizationInfo qinfo, QuantizationInfo output_qinfo)
     {
diff --git a/tests/validation/fixtures/ROIPoolingLayerFixture.h b/tests/validation/fixtures/ROIPoolingLayerFixture.h
new file mode 100644
index 0000000000..4b46a6176d
--- /dev/null
+++ b/tests/validation/fixtures/ROIPoolingLayerFixture.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_ROIPOOLINGLAYER_FIXTURE
+#define ARM_COMPUTE_TEST_ROIPOOLINGLAYER_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/ROIPoolingLayer.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ROIPoolingLayerGenericFixture : public framework::Fixture
+{ // Runs FunctionType on generated inputs (_target) and reference::roi_pool_layer on SimpleTensors (_reference).
+public:
+    void setup(TensorShape input_shape, const ROIPoolingLayerInfo pool_info, TensorShape rois_shape, DataType data_type, DataLayout data_layout, QuantizationInfo qinfo, QuantizationInfo output_qinfo)
+    {
+        _target    = compute_target(input_shape, data_type, data_layout, pool_info, rois_shape, qinfo, output_qinfo);
+        _reference = compute_reference(input_shape, data_type, pool_info, rois_shape, qinfo, output_qinfo); // data_layout is not forwarded to the reference path.
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor) // Used by both the target and the reference path to fill the source tensor.
+    {
+        library->fill_tensor_uniform(tensor, 0);
+    }
+
+    template <typename U>
+    void generate_rois(U &&rois, const TensorShape &shape, const ROIPoolingLayerInfo &pool_info, TensorShape rois_shape, DataLayout data_layout = DataLayout::NCHW) // Writes randomly generated ROI records into rois.
+    {
+        const size_t values_per_roi = rois_shape.x(); // Stride, in uint16_t elements, between consecutive ROI records.
+        const size_t num_rois       = rois_shape.y(); // Number of records written by the loop below.
+
+        std::mt19937 gen(library->seed()); // A new generator seeded from library->seed() is built on every call.
+        uint16_t    *rois_ptr = static_cast<uint16_t *>(rois.data());
+
+        const float pool_width  = pool_info.pooled_width();
+        const float pool_height = pool_info.pooled_height();
+        const float roi_scale   = pool_info.spatial_scale();
+
+        // Calculate distribution bounds from the input width/height, the spatial scale and the pooled size
+        const auto scaled_width  = static_cast<float>((shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)] / roi_scale) / pool_width);
+        const auto scaled_height = static_cast<float>((shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)] / roi_scale) / pool_height);
+        const auto min_width     = static_cast<float>(pool_width / roi_scale);
+        const auto min_height    = static_cast<float>(pool_height / roi_scale);
+
+        // Create distributions
+        std::uniform_int_distribution<int> dist_batch(0, shape[3] - 1); // Batch index range comes from shape[3] regardless of data_layout.
+        std::uniform_int_distribution<>    dist_x1(0, scaled_width); // The float bounds here and below are implicitly converted to int.
+        std::uniform_int_distribution<>    dist_y1(0, scaled_height);
+        std::uniform_int_distribution<>    dist_w(min_width, std::max(float(min_width), (pool_width - 2) * scaled_width)); // std::max keeps the upper bound at or above min_width.
+        std::uniform_int_distribution<>    dist_h(min_height, std::max(float(min_height), (pool_height - 2) * scaled_height)); // std::max keeps the upper bound at or above min_height.
+
+        for(unsigned int pw = 0; pw < num_rois; ++pw)
+        {
+            const auto batch_idx = dist_batch(gen);
+            const auto x1        = dist_x1(gen);
+            const auto y1        = dist_y1(gen);
+            const auto x2        = x1 + dist_w(gen);
+            const auto y2        = y1 + dist_h(gen);
+
+            rois_ptr[values_per_roi * pw]     = batch_idx; // Record layout: batch index, x1, y1, x2, y2.
+            rois_ptr[values_per_roi * pw + 1] = static_cast<uint16_t>(x1);
+            rois_ptr[values_per_roi * pw + 2] = static_cast<uint16_t>(y1);
+            rois_ptr[values_per_roi * pw + 3] = static_cast<uint16_t>(x2);
+            rois_ptr[values_per_roi * pw + 4] = static_cast<uint16_t>(y2);
+        }
+    }
+
+    TensorType compute_target(TensorShape                input_shape,
+                              DataType                   data_type,
+                              DataLayout                 data_layout,
+                              const ROIPoolingLayerInfo &pool_info,
+                              const TensorShape          rois_shape,
+                              const QuantizationInfo    &qinfo,
+                              const QuantizationInfo    &output_qinfo)
+    {
+        const QuantizationInfo rois_qinfo = is_data_type_quantized(data_type) ? QuantizationInfo(0.125f, 0) : QuantizationInfo(); // QuantizationInfo(0.125f, 0) is used for the ROIs only when data_type is quantized.
+
+        // Create tensors
+        TensorType src         = create_tensor<TensorType>(input_shape, data_type, 1, qinfo, data_layout);
+        TensorType rois_tensor = create_tensor<TensorType>(rois_shape, _rois_data_type, 1, rois_qinfo);
+
+        // Declare output tensor dst with a default-constructed shape (presumably filled in by configure() - TODO confirm)
+        const TensorShape dst_shape;
+        TensorType        dst = create_tensor<TensorType>(dst_shape, data_type, 1, output_qinfo, data_layout);
+
+        // Create and configure function
+        FunctionType roi_pool_layer;
+        roi_pool_layer.configure(&src, &rois_tensor, &dst, pool_info);
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(rois_tensor.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        rois_tensor.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!rois_tensor.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        fill(AccessorType(src));
+        generate_rois(AccessorType(rois_tensor), input_shape, pool_info, rois_shape, data_layout);
+
+        // Compute function
+        roi_pool_layer.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape         &input_shape,
+                                      DataType                   data_type,
+                                      const ROIPoolingLayerInfo &pool_info,
+                                      const TensorShape          rois_shape,
+                                      const QuantizationInfo    &qinfo,
+                                      const QuantizationInfo    &output_qinfo)
+    {
+        // Create reference tensor
+        SimpleTensor<T>        src{ input_shape, data_type, 1, qinfo };
+        const QuantizationInfo rois_qinfo = is_data_type_quantized(data_type) ? QuantizationInfo(0.125f, 0) : QuantizationInfo(); // Same ROI qinfo selection as in compute_target.
+        SimpleTensor<uint16_t> rois_tensor{ rois_shape, _rois_data_type, 1, rois_qinfo };
+
+        // Fill reference tensor
+        fill(src);
+        generate_rois(rois_tensor, input_shape, pool_info, rois_shape); // NOTE(review): uses the NCHW default for data_layout, unlike compute_target - confirm callers only pass NCHW.
+
+        return reference::roi_pool_layer(src, rois_tensor, pool_info, output_qinfo);
+    }
+
+    TensorType      _target{}; // Output of FunctionType, set by setup().
+    SimpleTensor<T> _reference{}; // Output of reference::roi_pool_layer, set by setup().
+    const DataType  _rois_data_type{ DataType::U16 }; // Data type used for both the target and the reference ROI tensors.
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ROIPoolingLayerQuantizedFixture : public ROIPoolingLayerGenericFixture<TensorType, AccessorType, FunctionType, T>
+{ // Fixture variant whose setup takes explicit input and output quantization info.
+public:
+    void setup(TensorShape input_shape, const ROIPoolingLayerInfo pool_info, TensorShape rois_shape, DataType data_type,
+               DataLayout data_layout, QuantizationInfo qinfo, QuantizationInfo output_qinfo)
+    {
+        ROIPoolingLayerGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, pool_info, rois_shape, // All arguments are forwarded unchanged to the generic setup.
+                                                                                        data_type, data_layout, qinfo, output_qinfo);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ROIPoolingLayerFixture : public ROIPoolingLayerGenericFixture<TensorType, AccessorType, FunctionType, T>
+{ // Fixture variant whose setup takes no quantization arguments.
+public:
+    void setup(TensorShape input_shape, const ROIPoolingLayerInfo pool_info, TensorShape rois_shape, DataType data_type, DataLayout data_layout)
+    {
+        ROIPoolingLayerGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, pool_info, rois_shape, data_type, data_layout,
+                                                                                        QuantizationInfo(), QuantizationInfo()); // Default-constructed QuantizationInfo for both input and output.
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_TEST_ROIPOOLINGLAYER_FIXTURE */
+\ No newline at end of file
diff --git a/tests/validation/fixtures/RangeFixture.h b/tests/validation/fixtures/RangeFixture.h
index 0713db9237..166613a318 100644
--- a/tests/validation/fixtures/RangeFixture.h
+++ b/tests/validation/fixtures/RangeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class RangeFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(const DataType data_type0, float start, float step, const QuantizationInfo qinfo0 = QuantizationInfo())
     {
         _target    = compute_target(data_type0, qinfo0, start, step);
@@ -113,11 +112,11 @@ protected:
         FunctionType range_func;
         range_func.configure(&dst, start, end, step);
 
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
         // Allocate tensors
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Compute function
         range_func.run();
diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h
index 36bf14b27e..e61941435c 100644
--- a/tests/validation/fixtures/ReduceMeanFixture.h
+++ b/tests/validation/fixtures/ReduceMeanFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReduceMeanValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info_input, QuantizationInfo quantization_info_output)
     {
         _target    = compute_target(shape, data_type, axis, keep_dims, quantization_info_input, quantization_info_output);
@@ -92,15 +91,15 @@ protected:
         FunctionType reduction_mean;
         reduction_mean.configure(&src, axis, keep_dims, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -124,7 +123,13 @@ protected:
         {
             TensorShape output_shape = i == 0 ? src_shape : out.shape();
             output_shape.set(axis[i], 1);
-            out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, quantization_info_output);
+            bool is_opencl = false;
+
+#ifdef ARM_COMPUTE_OPENCL_ENABLED
+            is_opencl = std::is_same<CLTensor, TensorType>::value; // Round down to zero on opencl to match kernel
+#endif                                                             /* ARM_COMPUTE_OPENCL_ENABLED */
+            out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, data_type, quantization_info_output,
+                                                       is_opencl ? RoundingPolicy::TO_ZERO : RoundingPolicy::TO_NEAREST_UP);
         }
 
         if(!keep_dims)
@@ -133,7 +138,7 @@ protected:
             std::sort(axis.begin(), axis.begin() + axis.num_dimensions());
             for(unsigned int i = 0; i < axis.num_dimensions(); ++i)
             {
-                output_shape.remove_dimension(axis[i] - i);
+                output_shape.remove_dimension(axis[i] - i, false);
             }
 
             out = reference::reshape_layer(out, output_shape);
@@ -149,7 +154,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReduceMeanQuantizedFixture : public ReduceMeanValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info_input, QuantizationInfo quantization_info_output)
     {
         ReduceMeanValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, keep_dims, quantization_info_input, quantization_info_output);
@@ -160,7 +164,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReduceMeanFixture : public ReduceMeanValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims)
     {
         ReduceMeanValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, keep_dims, QuantizationInfo(), QuantizationInfo());
diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h
index f3d653e6dc..b44f299486 100644
--- a/tests/validation/fixtures/ReductionOperationFixture.h
+++ b/tests/validation/fixtures/ReductionOperationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReductionOperationValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info, bool keep_dims = false)
     {
         const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
@@ -76,14 +75,14 @@ protected:
             if(tensor.data_type() == DataType::QASYMM8)
             {
                 std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
+                std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
 
                 library->fill(tensor, distribution, 0);
             }
             else if(tensor.data_type() == DataType::QASYMM8_SIGNED)
             {
                 std::pair<int, int> bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-                std::uniform_int_distribution<int8_t> distribution(bounds.first, bounds.second);
+                std::uniform_int_distribution<int32_t> distribution(bounds.first, bounds.second);
 
                 library->fill(tensor, distribution, 0);
             }
@@ -108,15 +107,15 @@ protected:
         FunctionType reduction_func;
         reduction_func.configure(&src, &dst, axis, op, _keep_dims);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -135,7 +134,7 @@ protected:
         // Fill reference
         fill(src);
 
-        return reference::reduction_operation<T, T>(src, dst_shape, axis, op, quantization_info);
+        return reference::reduction_operation<T, T>(src, dst_shape, axis, op, data_type, quantization_info);
     }
 
     TensorType      _target{};
@@ -149,7 +148,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReductionOperationQuantizedFixture : public ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info = QuantizationInfo(), bool keep_dims = false)
     {
         ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, quantization_info, keep_dims);
@@ -160,7 +158,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReductionOperationFixture : public ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, unsigned int axis, ReductionOperation op, bool keep_dims = false)
     {
         ReductionOperationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, QuantizationInfo(), keep_dims);
diff --git a/tests/validation/fixtures/RemapFixture.h b/tests/validation/fixtures/RemapFixture.h
deleted file mode 100644
index e851cdb4df..0000000000
--- a/tests/validation/fixtures/RemapFixture.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_REMAP_FIXTURE
-#define ARM_COMPUTE_TEST_REMAP_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Remap.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class RemapValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution(0, 255);
-        const T                                constant_border_value = static_cast<T>(distribution(gen));
-
-        _target    = compute_target(shape, policy, data_type, border_mode, constant_border_value);
-        _reference = compute_reference(shape, policy, data_type, border_mode, constant_border_value);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i, float min, float max)
-    {
-        std::uniform_int_distribution<> distribution((int)min, (int)max);
-        library->fill(tensor, distribution, i);
-    }
-
-    TensorType compute_target(const TensorShape &shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode, T constant_border_value)
-    {
-        // Create tensors
-        TensorType src   = create_tensor<TensorType>(shape, data_type);
-        TensorType map_x = create_tensor<TensorType>(shape, DataType::F32);
-        TensorType map_y = create_tensor<TensorType>(shape, DataType::F32);
-        TensorType dst   = create_tensor<TensorType>(shape, data_type);
-
-        // Create and configure function
-        FunctionType remap;
-        remap.configure(&src, &map_x, &map_y, &dst, policy, border_mode, constant_border_value);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(map_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(map_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        map_x.allocator()->allocate();
-        map_y.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!map_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!map_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), 0, 0, 255);
-        fill(AccessorType(map_x), 1, -5, shape.x() + 5);
-        fill(AccessorType(map_y), 2, -5, shape.y() + 5);
-
-        // Compute function
-        remap.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &shape, InterpolationPolicy policy, DataType data_type, BorderMode border_mode, T constant_border_value)
-    {
-        ARM_COMPUTE_ERROR_ON(data_type != DataType::U8);
-
-        // Create reference
-        SimpleTensor<T>     src{ shape, data_type };
-        SimpleTensor<float> map_x{ shape, DataType::F32 };
-        SimpleTensor<float> map_y{ shape, DataType::F32 };
-
-        // Create the valid mask Tensor
-        _valid_mask = SimpleTensor<T> { shape, data_type };
-
-        // Fill reference
-        fill(src, 0, 0, 255);
-        fill(map_x, 1, -5, shape.x() + 5);
-        fill(map_y, 2, -5, shape.y() + 5);
-
-        // Compute reference
-        return reference::remap<T>(src, map_x, map_y, _valid_mask, policy, border_mode, constant_border_value);
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    SimpleTensor<T> _valid_mask{};
-};
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REMAP_FIXTURE */
diff --git a/tests/validation/fixtures/ReorderFixture.h b/tests/validation/fixtures/ReorderFixture.h
new file mode 100644
index 0000000000..8e28484c48
--- /dev/null
+++ b/tests/validation/fixtures/ReorderFixture.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/Reorder.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** [ReorderLayer fixture] **/
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ReorderValidationFixture : public framework::Fixture
+{
+public:
+    /** Clear @ref _hardware_supports when @p output_wf is not exercised on the CPU running the test. */
+    void check_hardware_supports(WeightFormat output_wf)
+    {
+        const bool has_sve     = Scheduler::get().cpu_info().has_sve();
+        const bool vl_mismatch = has_sve && arm_gemm::utils::get_vector_length<float>() != 8;
+        // Rejected: any format other than OHWIo4 without SVE, and OHWIo8 with SVE when
+        // get_vector_length<float>() is not 8.
+        if((!has_sve && output_wf != WeightFormat::OHWIo4) || (vl_mismatch && output_wf == WeightFormat::OHWIo8))
+        {
+            _hardware_supports = false;
+        }
+    }
+
+    /** Compute target and reference outputs; both stay default-constructed when the hardware check fails. */
+    void setup(TensorShape input_shape, TensorShape output_shape, WeightFormat input_wf, WeightFormat output_wf, DataType data_type)
+    {
+        check_hardware_supports(output_wf);
+        if(!_hardware_supports)
+        {
+            return;
+        }
+        _target    = compute_target(input_shape, output_shape, input_wf, output_wf, data_type);
+        _reference = compute_reference(input_shape, output_shape, output_wf, data_type);
+    }
+
+protected:
+    /** Fill @p tensor via the assets library; target and reference paths both go through this same call. */
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        library->fill_tensor_uniform(tensor, 0);
+    }
+
+    /** Configure and run FunctionType on newly created tensors and return the destination tensor. */
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, WeightFormat input_wf, WeightFormat output_wf, DataType data_type)
+    {
+        // Create source and destination tensors (allocated further below)
+        TensorType src = create_tensor<TensorType>(input_shape, data_type);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+
+        FunctionType reorder;
+        reorder.configure(&src, &dst, input_wf, output_wf);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // After allocation both tensors are expected to report as non-resizable
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Only the source is filled before running the function
+        fill(AccessorType(src));
+        reorder.run();
+
+        return dst;
+    }
+
+    /** Build the expected output by applying the reference reorder to a SimpleTensor filled through fill(). */
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, WeightFormat output_wf, DataType data_type)
+    {
+        SimpleTensor<T> src{ input_shape, data_type };
+        fill(src);
+
+        return reference::reorder_layer<T>(src, output_shape, output_wf);
+    }
+
+    bool            _hardware_supports = true; /**< Set to false by check_hardware_supports() for rejected formats */
+    TensorType      _target{};                 /**< Result returned by compute_target() */
+    SimpleTensor<T> _reference{};              /**< Result returned by compute_reference() */
+};
+/** [ReorderLayer fixture] **/
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H
diff --git a/tests/validation/fixtures/ReorgLayerFixture.h b/tests/validation/fixtures/ReorgLayerFixture.h
index 630802214a..f87017190e 100644
--- a/tests/validation/fixtures/ReorgLayerFixture.h
+++ b/tests/validation/fixtures/ReorgLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReorgLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, int32_t stride, DataType data_type, DataLayout data_layout)
     {
         _target    = compute_target(input_shape, stride, data_type, data_layout);
@@ -74,15 +73,15 @@ protected:
 
         reorg.configure(&src, &dst, stride);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/ReshapeLayerFixture.h b/tests/validation/fixtures/ReshapeLayerFixture.h
index a89a94741f..5be431f8cf 100644
--- a/tests/validation/fixtures/ReshapeLayerFixture.h
+++ b/tests/validation/fixtures/ReshapeLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_RESHAPE_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_RESHAPE_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_RESHAPELAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_RESHAPELAYERFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -31,6 +31,7 @@
 #include "tests/IAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/reference/ReshapeLayer.h"
 
 namespace arm_compute
@@ -41,13 +42,12 @@ namespace validation
 {
 /** [ReshapeLayer fixture] **/
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ReshapeLayerValidationFixture : public framework::Fixture
+class ReshapeLayerGenericValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type, bool add_x_padding = false)
     {
-        _target    = compute_target(input_shape, output_shape, data_type);
+        _target    = compute_target(input_shape, output_shape, data_type, add_x_padding);
         _reference = compute_reference(input_shape, output_shape, data_type);
     }
 
@@ -58,10 +58,10 @@ protected:
         library->fill_tensor_uniform(tensor, i);
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type, bool add_x_padding = false)
     {
         // Check if indeed the input shape can be reshape to the output one
-        ARM_COMPUTE_EXPECT(input_shape.total_size() == output_shape.total_size(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input_shape.total_size() == output_shape.total_size());
 
         // Create tensors
         TensorType src = create_tensor<TensorType>(input_shape, data_type);
@@ -72,15 +72,21 @@ protected:
 
         reshape.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        if(add_x_padding)
+        {
+            // Add random padding in x dimension
+            add_padding_x({ &src, &dst });
+        }
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
@@ -105,8 +111,27 @@ protected:
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ReshapeLayerValidationFixture : public ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    { // Unpadded variant: the padding flag is spelled out and equals the generic fixture's default
+        ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, output_shape, data_type, false /* add_x_padding */);
+    }
+};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ReshapeLayerPaddedValidationFixture : public ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    { // Padded variant: forwards to the generic fixture with add_x_padding enabled, which pads src and dst in x before allocation
+        ReshapeLayerGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, output_shape, data_type, true /* add_x_padding */);
+    }
+};
 /** [ReshapeLayer fixture] **/
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_RESHAPE_LAYER_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_RESHAPELAYERFIXTURE_H
diff --git a/tests/validation/fixtures/ReverseFixture.h b/tests/validation/fixtures/ReverseFixture.h
index 4982cae578..856bff7b12 100644
--- a/tests/validation/fixtures/ReverseFixture.h
+++ b/tests/validation/fixtures/ReverseFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_REVERSE_FIXTURE
-#define ARM_COMPUTE_TEST_REVERSE_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_REVERSEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_REVERSEFIXTURE_H
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorShape.h"
@@ -45,11 +45,11 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ReverseValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
-    void setup(TensorShape shape, TensorShape axis_shape, DataType data_type)
+    void setup(TensorShape shape, TensorShape axis_shape, DataType data_type, bool use_negative_axis = false, bool use_inverted_axis = false)
     {
-        _target    = compute_target(shape, axis_shape, data_type);
-        _reference = compute_reference(shape, axis_shape, data_type);
+        _num_dims  = shape.num_dimensions();
+        _target    = compute_target(shape, axis_shape, data_type, use_negative_axis, use_inverted_axis);
+        _reference = compute_reference(shape, axis_shape, data_type, use_negative_axis, use_inverted_axis);
     }
 
 protected:
@@ -58,16 +58,25 @@ protected:
     {
         library->fill_tensor_uniform(tensor, 0);
     }
-    std::vector<int> generate_random_axis()
+    std::vector<int32_t> generate_random_axis(bool use_negative = false)
     {
-        std::vector<int> axis_v = { 0, 1, 2, 3 };
-        std::mt19937     g(0);
+        std::vector<int32_t> axis_v;
+        if(use_negative)
+        {
+            axis_v = { -1, -2, -3, -4 };
+        }
+        else
+        {
+            axis_v = { 0, 1, 2, 3 };
+        }
+        axis_v = std::vector<int32_t>(axis_v.begin(), axis_v.begin() + _num_dims);
+        std::mt19937 g(library->seed());
         std::shuffle(axis_v.begin(), axis_v.end(), g);
 
         return axis_v;
     }
 
-    TensorType compute_target(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type)
+    TensorType compute_target(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type, bool use_negative_axis, bool use_inverted_axis = false)
     {
         // Create tensors
         TensorType src  = create_tensor<TensorType>(shape, data_type, 1);
@@ -76,27 +85,27 @@ protected:
 
         // Create and configure function
         FunctionType reverse_func;
-        reverse_func.configure(&src, &dst, &axis);
+        reverse_func.configure(&src, &dst, &axis, use_inverted_axis);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(axis.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(axis.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         axis.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!axis.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!axis.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
         {
             auto axis_data = AccessorType(axis);
-            auto axis_v    = generate_random_axis();
-            std::copy(axis_v.begin(), axis_v.begin() + axis_shape.x(), static_cast<int32_t *>(axis_data.data()));
+            auto axis_v    = generate_random_axis(use_negative_axis);
+            std::copy(axis_v.begin(), axis_v.begin() + axis_shape.total_size(), static_cast<int32_t *>(axis_data.data()));
         }
 
         // Compute function
@@ -105,24 +114,25 @@ protected:
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type, bool use_negative_axis, bool use_inverted_axis = false)
     {
         // Create reference
-        SimpleTensor<T>        src{ shape, data_type };
-        SimpleTensor<uint32_t> axis{ axis_shape, DataType::U32 };
+        SimpleTensor<T>       src{ shape, data_type };
+        SimpleTensor<int32_t> axis{ axis_shape, DataType::S32 };
 
         // Fill reference
         fill(src);
-        auto axis_v = generate_random_axis();
-        std::copy(axis_v.begin(), axis_v.begin() + axis_shape.x(), axis.data());
+        auto axis_v = generate_random_axis(use_negative_axis);
+        std::copy(axis_v.begin(), axis_v.begin() + axis_shape.total_size(), axis.data());
 
-        return reference::reverse<T>(src, axis);
+        return reference::reverse<T>(src, axis, use_inverted_axis);
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
+    unsigned int    _num_dims{};
 };
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REVERSE_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_REVERSEFIXTURE_H
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index dd521470e6..86d89d71f7 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,15 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_SCALE_FIXTURE
-#define ARM_COMPUTE_TEST_SCALE_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_SCALEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_SCALEFIXTURE_H
+
+#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT
 #include "tests/framework/Fixture.h"
 #include "tests/validation/reference/Permute.h"
 #include "tests/validation/reference/Scale.h"
@@ -44,22 +39,23 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ScaleValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy,
-               bool align_corners)
+               bool align_corners, bool mixed_layout, QuantizationInfo output_quantization_info)
     {
-        _shape             = shape;
-        _policy            = policy;
-        _border_mode       = border_mode;
-        _sampling_policy   = sampling_policy;
-        _data_type         = data_type;
-        _quantization_info = quantization_info;
-        _align_corners     = align_corners;
+        _shape                    = shape;
+        _policy                   = policy;
+        _border_mode              = border_mode;
+        _sampling_policy          = sampling_policy;
+        _data_type                = data_type;
+        _input_quantization_info  = quantization_info;
+        _output_quantization_info = output_quantization_info;
+        _align_corners            = align_corners;
+        _mixed_layout             = mixed_layout;
 
         generate_scale(shape);
 
-        std::mt19937                           generator(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
+        std::mt19937                            generator(library->seed());
+        std::uniform_int_distribution<uint32_t> distribution_u8(0, 255);
         _constant_border_value = static_cast<T>(distribution_u8(generator));
 
         _target    = compute_target(shape, data_layout);
@@ -67,6 +63,21 @@ public:
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        const DataLayout data_layout = src.info()->data_layout();
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Scale function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout);
+        dst.info()->set_data_layout(data_layout);
+    }
+
     void generate_scale(const TensorShape &shape)
     {
         static constexpr float _min_scale{ 0.25f };
@@ -128,7 +139,7 @@ protected:
         }
 
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, _quantization_info, data_layout);
+        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, _input_quantization_info, data_layout);
 
         const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
         const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -136,40 +147,48 @@ protected:
         TensorShape shape_scaled(shape);
         shape_scaled.set(idx_width, shape[idx_width] * _scale_x, /* apply_dim_correction = */ false);
         shape_scaled.set(idx_height, shape[idx_height] * _scale_y, /* apply_dim_correction = */ false);
-        TensorType dst = create_tensor<TensorType>(shape_scaled, _data_type, 1, _quantization_info, data_layout);
+        TensorType dst = create_tensor<TensorType>(shape_scaled, _data_type, 1, _output_quantization_info, data_layout);
 
         // Create and configure function
         FunctionType scale;
 
         scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ false, _align_corners });
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &dst }, data_layout);
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
 
-        // Compute function
-        scale.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(scale, src, dst);
+        }
+        else
+        {
+            // Compute function
+            scale.run();
+        }
         return dst;
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape)
     {
         // Create reference
-        SimpleTensor<T> src{ shape, _data_type, 1, _quantization_info };
+        SimpleTensor<T> src{ shape, _data_type, 1, _input_quantization_info };
 
         // Fill reference
         fill(src);
 
-        return reference::scale<T>(src, _scale_x, _scale_y, _policy, _border_mode, _constant_border_value, _sampling_policy, /* ceil_policy_scale */ false, _align_corners);
+        return reference::scale<T>(src, _scale_x, _scale_y, _policy, _border_mode, _constant_border_value, _sampling_policy, /* ceil_policy_scale */ false, _align_corners, _output_quantization_info);
     }
 
     TensorType          _target{};
@@ -180,17 +199,18 @@ protected:
     T                   _constant_border_value{};
     SamplingPolicy      _sampling_policy{};
     DataType            _data_type{};
-    QuantizationInfo    _quantization_info{};
+    QuantizationInfo    _input_quantization_info{};
+    QuantizationInfo    _output_quantization_info{};
     bool                _align_corners{ false };
+    bool                _mixed_layout{ false };
     float               _scale_x{ 1.f };
     float               _scale_y{ 1.f };
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class ScaleValidationQuantizedFixture : public ScaleValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy,
                bool align_corners)
     {
@@ -201,14 +221,35 @@ public:
                                                                                         policy,
                                                                                         border_mode,
                                                                                         sampling_policy,
-                                                                                        align_corners);
+                                                                                        align_corners,
+                                                                                        mixed_layout,
+                                                                                        quantization_info);
     }
 };
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
+class ScaleValidationDifferentOutputQuantizedFixture : public ScaleValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape shape, DataType data_type, QuantizationInfo input_quantization_info, QuantizationInfo output_quantization_info, DataLayout data_layout, InterpolationPolicy policy,
+               BorderMode border_mode, SamplingPolicy sampling_policy,
+               bool align_corners)
+    {
+        ScaleValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape,
+                                                                                        data_type,
+                                                                                        input_quantization_info,
+                                                                                        data_layout,
+                                                                                        policy,
+                                                                                        border_mode,
+                                                                                        sampling_policy,
+                                                                                        align_corners,
+                                                                                        mixed_layout,
+                                                                                        output_quantization_info);
+    }
+};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class ScaleValidationFixture : public ScaleValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, DataLayout data_layout, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy, bool align_corners)
     {
         ScaleValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape,
@@ -218,10 +259,12 @@ public:
                                                                                         policy,
                                                                                         border_mode,
                                                                                         sampling_policy,
-                                                                                        align_corners);
+                                                                                        align_corners,
+                                                                                        mixed_layout,
+                                                                                        QuantizationInfo());
     }
 };
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SCALE_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_SCALEFIXTURE_H
diff --git a/tests/validation/fixtures/ScatterLayerFixture.h b/tests/validation/fixtures/ScatterLayerFixture.h
new file mode 100644
index 0000000000..af161ef98b
--- /dev/null
+++ b/tests/validation/fixtures/ScatterLayerFixture.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_SCATTERLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_SCATTERLAYERFIXTURE_H
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "tests/Globals.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ScatterLayer.h"
+#include "tests/SimpleTensor.h"
+
+#include <random>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ScatterGenericValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape src_shape, TensorShape updates_shape, TensorShape indices_shape,
+        TensorShape out_shape, DataType data_type, ScatterInfo scatter_info, bool inplace, bool padding,
+        QuantizationInfo src_qinfo = QuantizationInfo(), QuantizationInfo o_qinfo = QuantizationInfo())
+    {
+        // Shape-dependent offset passed to fill()/fill_indices() to improve randomness across tests.
+        _hash = src_shape[0] + src_shape[1] + src_shape[2] + src_shape[3] + src_shape[4] + src_shape[5]
+              + updates_shape[0] + updates_shape[1] + updates_shape[2] + updates_shape[3]
+              + updates_shape[4] + updates_shape[5]
+              + indices_shape[0] + indices_shape[1] + indices_shape[2] + indices_shape[3];
+
+        _target    = compute_target(src_shape, updates_shape, indices_shape,  out_shape, data_type, scatter_info, inplace, padding, src_qinfo, o_qinfo);
+        _reference = compute_reference(src_shape, updates_shape, indices_shape,  out_shape, data_type,scatter_info, src_qinfo , o_qinfo);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            case DataType::F16:
+            {
+                std::uniform_real_distribution<float> distribution(-10.f, 10.f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::S32:
+            case DataType::S16:
+            case DataType::S8:
+            {
+                std::uniform_int_distribution<int32_t> distribution(-100, 100);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::U32:
+            case DataType::U16:
+            case DataType::U8:
+            {
+                std::uniform_int_distribution<uint32_t> distribution(0, 200);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported data type.");
+            }
+        }
+    }
+
+    // This is used to fill indices tensor with S32 datatype.
+    // Used to prevent ONLY having values that are out of bounds.
+    template <typename U>
+    void fill_indices(U &&tensor, int i, const TensorShape &shape)
+    {
+        // Upper bound for the generated indices: the smallest of the first three dimensions of the shape plus one, so that some out of bounds values are also tested.
+        const int32_t max = std::min({shape[0] , shape[1], shape[2]}) + 1;
+        library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(0), static_cast<int32_t>(max));
+    }
+
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c,
+        const TensorShape &out_shape, DataType data_type, const ScatterInfo info, bool inplace, bool padding,
+        QuantizationInfo a_qinfo, QuantizationInfo o_qinfo)
+    {
+        // 1. Create relevant tensors using ScatterInfo data structure.
+        // ----------------------------------------------------
+        // In order - src, updates, indices, output.
+        TensorType src   = create_tensor<TensorType>(shape_a, data_type, 1, a_qinfo);
+        TensorType updates   = create_tensor<TensorType>(shape_b, data_type, 1, a_qinfo);
+        TensorType indices   = create_tensor<TensorType>(shape_c, DataType::S32, 1, QuantizationInfo());
+        TensorType dst = create_tensor<TensorType>(out_shape, data_type, 1, o_qinfo);
+
+        FunctionType scatter;
+
+        // Configure operator
+        // When scatter_info.zero_initialization is true, pass nullptr for src
+        // because dst does not need to be initialized with src values.
+        if(info.zero_initialization)
+        {
+            scatter.configure(nullptr, &updates, &indices, &dst, info);
+        }
+        else
+        {
+            if(inplace)
+            {
+                scatter.configure(&src, &updates, &indices, &src, info);
+            }
+            else
+            {
+                scatter.configure(&src, &updates, &indices, &dst, info);
+            }
+        }
+
+        // Assertions
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(updates.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(indices.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        if(padding)
+        {
+            add_padding_x({ &src, &updates, &indices});
+
+            if(!inplace)
+            {
+                add_padding_x({ &dst });
+            }
+        }
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        updates.allocator()->allocate();
+        indices.allocator()->allocate();
+
+        if(!inplace)
+        {
+            dst.allocator()->allocate();
+        }
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!updates.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!indices.info()->is_resizable());
+
+        if(!inplace)
+        {
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+        }
+
+        // Fill src, updates and indices tensors.
+        fill(AccessorType(src), 0 + _hash);
+        fill(AccessorType(updates), 1+ _hash);
+        fill_indices(AccessorType(indices), 2 + _hash, out_shape);
+
+        scatter.run();
+
+        if(inplace)
+        {
+            return src;
+        }
+        else
+        {
+            return dst;
+        }
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &a_shape, const TensorShape &b_shape, const TensorShape &c_shape,
+        const TensorShape &out_shape, DataType data_type, ScatterInfo info, QuantizationInfo a_qinfo, QuantizationInfo o_qinfo)
+    {
+        // Output Quantization not currently in use - fixture should be extended to support this.
+        ARM_COMPUTE_UNUSED(o_qinfo);
+        TensorShape src_shape = a_shape;
+        TensorShape updates_shape = b_shape;
+        TensorShape indices_shape = c_shape;
+        const int num_ind_dims = c_shape.num_dimensions();
+
+        // 1. Collapse batch index into a single dim if necessary for update tensor and indices tensor.
+        if(num_ind_dims >= 3)
+        {
+            indices_shape = indices_shape.collapsed_from(1);
+            updates_shape = updates_shape.collapsed_from(updates_shape.num_dimensions() - (num_ind_dims -1)); // Collapses batch dims
+        }
+
+        // 2. Collapse data dims into a single dim.
+        //    Collapse all src dims into 2 dims. First one holding data, the other being the index we iterate over.
+        src_shape.collapse(updates_shape.num_dimensions() - 1);     // Collapse all data dims into single dim.
+        src_shape = src_shape.collapsed_from(1);                    // Collapse all index dims into a single dim
+        updates_shape.collapse(updates_shape.num_dimensions() - 1); // Collapse data dims (all except last dim which is batch dim)
+
+        // Create reference tensors
+        SimpleTensor<T> src{ src_shape, data_type, 1, a_qinfo };
+        SimpleTensor<T> updates{updates_shape, data_type, 1, QuantizationInfo() };
+        SimpleTensor<int32_t> indices{ indices_shape, DataType::S32, 1, QuantizationInfo() };
+
+        // Fill reference
+        fill(src, 0 + _hash);
+        fill(updates, 1 + _hash);
+        fill_indices(indices, 2 + _hash, out_shape);
+
+        // Calculate individual reference using collapsed shapes
+        return reference::scatter_layer<T>(src, updates, indices, out_shape, info);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    int32_t _hash{};
+};
+
+// Non-quantized fixture: forwards all shapes to the generic fixture with default QuantizationInfo for src and output.
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ScatterValidationFixture : public ScatterGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape src_shape, TensorShape update_shape, TensorShape indices_shape,
+        TensorShape out_shape, DataType data_type, ScatterFunction func, bool zero_init, bool inplace, bool padding)
+    {
+        ScatterGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, update_shape,
+            indices_shape, out_shape, data_type, ScatterInfo(func, zero_init), inplace, padding,
+            QuantizationInfo(), QuantizationInfo());
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_SCATTERLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/ScharrFixture.h b/tests/validation/fixtures/ScharrFixture.h
index 204ffc6767..b54a9d29e6 100644
--- a/tests/validation/fixtures/ScharrFixture.h
+++ b/tests/validation/fixtures/ScharrFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ScharrValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, BorderMode border_mode, Format format, GradientDimension gradient_dimension)
     {
         // Generate a random constant value
@@ -120,18 +119,18 @@ protected:
                 ARM_COMPUTE_ERROR("Gradient dimension not supported");
         }
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst_x.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst_y.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst_x.allocator()->allocate();
         dst_y.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst_x.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst_y.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/SelectFixture.h b/tests/validation/fixtures/SelectFixture.h
index 96a7c8666a..8cb6f062f9 100644
--- a/tests/validation/fixtures/SelectFixture.h
+++ b/tests/validation/fixtures/SelectFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SelectValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, bool has_same_same_rank, DataType data_type)
     {
         TensorShape condition_shape = detail::select_condition_shape(shape, has_same_same_rank);
@@ -97,10 +96,10 @@ protected:
         FunctionType select;
         select.configure(&c_t, &x_t, &y_t, &dst_t);
 
-        ARM_COMPUTE_EXPECT(c_t.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(x_t.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(y_t.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst_t.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(c_t.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(x_t.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(y_t.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst_t.info()->is_resizable());
 
         // Allocate tensors
         c_t.allocator()->allocate();
@@ -108,10 +107,10 @@ protected:
         y_t.allocator()->allocate();
         dst_t.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!c_t.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!x_t.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!y_t.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst_t.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!c_t.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!x_t.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!y_t.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst_t.info()->is_resizable());
 
         // Fill tensors
         fill_bool(AccessorType(c_t), 0);
diff --git a/tests/validation/fixtures/SliceOperationsFixtures.h b/tests/validation/fixtures/SliceOperationsFixtures.h
index c1e046e427..b1f91ea2e0 100644
--- a/tests/validation/fixtures/SliceOperationsFixtures.h
+++ b/tests/validation/fixtures/SliceOperationsFixtures.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,6 @@
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
-#include "tests/RawLutAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
@@ -46,7 +45,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SliceFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, Coordinates starts, Coordinates ends, DataType data_type)
     {
         _target    = compute_target(shape, starts, ends, data_type);
@@ -70,15 +68,15 @@ protected:
         FunctionType slice;
         slice.configure(&src, &dst, starts, ends);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
@@ -109,7 +107,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class StridedSliceFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape,
                Coordinates starts, Coordinates ends, BiStrides strides,
                int32_t begin_mask, int32_t end_mask, int32_t shrink_mask,
@@ -139,15 +136,15 @@ protected:
         FunctionType strided_slice;
         strided_slice.configure(&src, &dst, starts, ends, strides, begin_mask, end_mask, shrink_mask);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h
index c39ab740cd..f4bf8df9c0 100644
--- a/tests/validation/fixtures/SoftmaxLayerFixture.h
+++ b/tests/validation/fixtures/SoftmaxLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SoftmaxValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta, size_t axis)
     {
         _quantization_info = quantization_info;
@@ -91,15 +90,15 @@ protected:
         FunctionType smx_layer;
         smx_layer.configure(&src, &dst, beta, axis);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -131,7 +130,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SoftmaxValidationFixture : public SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T, IS_LOG>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, float beta, size_t axis)
     {
         SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T, IS_LOG>::setup(shape,
@@ -146,7 +144,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SoftmaxValidationQuantizedFixture : public SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T, IS_LOG>
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta, size_t axis)
     {
         SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T, IS_LOG>::setup(shape,
diff --git a/tests/validation/fixtures/SpaceToBatchFixture.h b/tests/validation/fixtures/SpaceToBatchFixture.h
index c4076e6ed6..964e511301 100644
--- a/tests/validation/fixtures/SpaceToBatchFixture.h
+++ b/tests/validation/fixtures/SpaceToBatchFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,7 +39,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SpaceToBatchLayerValidationGenericFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape block_shape_shape, TensorShape paddings_shape, TensorShape output_shape,
                DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info)
     {
@@ -79,10 +78,10 @@ protected:
         FunctionType space_to_batch;
         space_to_batch.configure(&input, &block_shape, &paddings, &output);
 
-        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(block_shape.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(paddings.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(block_shape.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(paddings.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
         // Allocate tensors
         input.allocator()->allocate();
@@ -90,10 +89,10 @@ protected:
         paddings.allocator()->allocate();
         output.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!block_shape.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!paddings.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!block_shape.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!paddings.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(input), 0);
@@ -140,7 +139,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SpaceToBatchLayerValidationFixture : public SpaceToBatchLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape block_shape_shape, TensorShape paddings_shape, TensorShape output_shape,
                DataType data_type, DataLayout data_layout)
     {
@@ -152,7 +150,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SpaceToBatchLayerValidationQuantizedFixture : public SpaceToBatchLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape block_shape_shape, TensorShape paddings_shape, TensorShape output_shape,
                DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info)
     {
diff --git a/tests/validation/fixtures/SpaceToDepthFixture.h b/tests/validation/fixtures/SpaceToDepthFixture.h
index 45ea34b0f5..2d2e9fad7d 100644
--- a/tests/validation/fixtures/SpaceToDepthFixture.h
+++ b/tests/validation/fixtures/SpaceToDepthFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_SPACE_TO_DEPTH_LAYER_FIXTURE
 #define ARM_COMPUTE_TEST_SPACE_TO_DEPTH_LAYER_FIXTURE
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/Globals.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
@@ -39,7 +40,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class SpaceToDepthLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape output_shape, const int block_shape, DataType data_type, DataLayout data_layout)
     {
         _target    = compute_target(input_shape, output_shape, block_shape, data_type, data_layout);
@@ -69,19 +69,25 @@ protected:
         TensorType input  = create_tensor<TensorType>(input_shape, data_type, 1, QuantizationInfo(), data_layout);
         TensorType output = create_tensor<TensorType>(output_shape, data_type, 1, QuantizationInfo(), data_layout);
 
+        auto calc_out_shape = misc::shape_calculator::compute_space_to_depth_shape(input.info(), block_shape);
+        ARM_COMPUTE_ASSERT(output_shape[0] == calc_out_shape[0]);
+        ARM_COMPUTE_ASSERT(output_shape[1] == calc_out_shape[1]);
+        ARM_COMPUTE_ASSERT(output_shape[2] == calc_out_shape[2]);
+        ARM_COMPUTE_ASSERT(output_shape[3] == calc_out_shape[3]);
+
         // Create and configure function
         FunctionType space_to_depth;
         space_to_depth.configure(&input, &output, block_shape);
 
-        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(output.info()->is_resizable());
 
         // Allocate tensors
         input.allocator()->allocate();
         output.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(input), 0);
diff --git a/tests/validation/fixtures/SplitFixture.h b/tests/validation/fixtures/SplitFixture.h
index 03ff41e993..203925329c 100644
--- a/tests/validation/fixtures/SplitFixture.h
+++ b/tests/validation/fixtures/SplitFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,6 @@
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
-#include "tests/RawLutAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
@@ -48,7 +47,6 @@ template <typename TensorType, typename ITensorType, typename AccessorType, type
 class SplitFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, unsigned int axis, unsigned int splits, DataType data_type)
     {
         _target    = compute_target(shape, axis, splits, data_type);
@@ -77,7 +75,7 @@ protected:
         FunctionType split;
         split.configure(&src, dsts_ptr, axis);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
         ARM_COMPUTE_EXPECT(std::all_of(dsts.cbegin(), dsts.cend(), [](const TensorType & t)
         {
             return t.info()->is_resizable();
@@ -91,7 +89,7 @@ protected:
             dsts[i].allocator()->allocate();
         }
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
         ARM_COMPUTE_EXPECT(std::all_of(dsts.cbegin(), dsts.cend(), [](const TensorType & t)
         {
             return !t.info()->is_resizable();
@@ -150,7 +148,6 @@ template <typename TensorType, typename ITensorType, typename AccessorType, type
 class SplitShapesFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, unsigned int axis, std::vector<TensorShape> split_shapes, DataType data_type)
     {
         _target    = compute_target(shape, axis, split_shapes, data_type);
@@ -186,7 +183,7 @@ protected:
         FunctionType split;
         split.configure(&src, dsts_ptr, axis);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
         ARM_COMPUTE_EXPECT(std::all_of(dsts.cbegin(), dsts.cend(), [](const TensorType & t)
         {
             return t.info()->is_resizable();
@@ -200,7 +197,7 @@ protected:
             dsts[i].allocator()->allocate();
         }
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
         ARM_COMPUTE_EXPECT(std::all_of(dsts.cbegin(), dsts.cend(), [](const TensorType & t)
         {
             return !t.info()->is_resizable();
diff --git a/tests/validation/fixtures/StackLayerFixture.h b/tests/validation/fixtures/StackLayerFixture.h
index 7bf63a3ebc..7dd8fe47dc 100644
--- a/tests/validation/fixtures/StackLayerFixture.h
+++ b/tests/validation/fixtures/StackLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorShape.h"
@@ -52,10 +52,9 @@ template <typename TensorType, typename AbstractTensorType, typename AccessorTyp
 class StackLayerValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape_src, int axis, DataType data_type, int num_tensors)
     {
-        _target    = compute_target(shape_src, axis, data_type, num_tensors);
+        _target    = compute_target(shape_src, axis, data_type, num_tensors, false /* add_x_padding */);
         _reference = compute_reference(shape_src, axis, data_type, num_tensors);
     }
 
@@ -66,7 +65,7 @@ protected:
         library->fill_tensor_uniform(tensor, i);
     }
 
-    TensorType compute_target(TensorShape shape_src, int axis, DataType data_type, int num_tensors)
+    TensorType compute_target(TensorShape shape_src, int axis, DataType data_type, int num_tensors, bool add_x_padding)
     {
         std::vector<TensorType>           tensors(num_tensors);
         std::vector<AbstractTensorType *> src(num_tensors);
@@ -76,7 +75,7 @@ protected:
         {
             tensors[i] = create_tensor<TensorType>(shape_src, data_type);
             src[i]     = &(tensors[i]);
-            ARM_COMPUTE_EXPECT(tensors[i].info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(tensors[i].info()->is_resizable());
         }
 
         // Create tensors
@@ -91,18 +90,28 @@ protected:
         // Allocate and fill the input tensors
         for(int i = 0; i < num_tensors; ++i)
         {
-            ARM_COMPUTE_EXPECT(tensors[i].info()->is_resizable(), framework::LogLevel::ERRORS);
+            if(add_x_padding)
+            {
+                add_padding_x({&tensors[i]}, DataLayout::NHWC);
+            }
+
+            ARM_COMPUTE_ASSERT(tensors[i].info()->is_resizable());
             tensors[i].allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!tensors[i].info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!tensors[i].info()->is_resizable());
 
             // Fill input tensor
             fill(AccessorType(tensors[i]), i);
         }
 
+        if(add_x_padding)
+        {
+            add_padding_x({&dst}, DataLayout::NHWC);
+        }
+
         // Allocate output tensor
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Compute stack function
         stack.run();
@@ -132,7 +141,21 @@ protected:
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
+template <typename TensorType, typename AbstractTensorType, typename AccessorType, typename FunctionType, typename T>
+class StackLayerWithPaddingValidationFixture :
+    public StackLayerValidationFixture<TensorType, AbstractTensorType, AccessorType, FunctionType, T>
+{
+public:
+    using Parent = StackLayerValidationFixture<TensorType, AbstractTensorType, AccessorType, FunctionType, T>;
+
+    void setup(TensorShape shape_src, int axis, DataType data_type, int num_tensors)
+    {
+        Parent::_target    = Parent::compute_target(shape_src, axis, data_type, num_tensors, true /* add_x_padding */);
+        Parent::_reference = Parent::compute_reference(shape_src, axis, data_type, num_tensors);
+    }
+};
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_STACK_LAYER_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_STACKLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/TileFixture.h b/tests/validation/fixtures/TileFixture.h
index 0dfcc330f3..979eee5ab1 100644
--- a/tests/validation/fixtures/TileFixture.h
+++ b/tests/validation/fixtures/TileFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class TileValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type, const Multiples &multiples)
     {
         _target    = compute_target(shape, data_type, multiples);
@@ -68,15 +67,15 @@ protected:
         FunctionType tile_func;
         tile_func.configure(&src, &dst, multiples);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
diff --git a/tests/validation/fixtures/TransposeFixture.h b/tests/validation/fixtures/TransposeFixture.h
index 757e6c3c07..212c76cc9a 100644
--- a/tests/validation/fixtures/TransposeFixture.h
+++ b/tests/validation/fixtures/TransposeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE
-#define ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -32,7 +32,7 @@
 #include "tests/IAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
-#include "tests/validation/reference/Transpose.h"
+#include "tests/validation/reference/Permute.h"
 
 namespace arm_compute
 {
@@ -44,7 +44,6 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class TransposeValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
         _target    = compute_target(shape, data_type);
@@ -71,15 +70,15 @@ protected:
         FunctionType trans_func;
         trans_func.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src));
@@ -98,7 +97,7 @@ protected:
         // Fill reference
         fill(src);
 
-        return reference::transpose<T>(src);
+        return reference::permute<T>(src, PermutationVector(1U, 0U));
     }
 
     TensorType      _target{};
@@ -107,4 +106,4 @@ protected:
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H
diff --git a/tests/validation/fixtures/UNIT/ContextFixture.h b/tests/validation/fixtures/UNIT/ContextFixture.h
new file mode 100644
index 0000000000..77cbc12320
--- /dev/null
+++ b/tests/validation/fixtures/UNIT/ContextFixture.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_UNIT_CONTEXT_FIXTURE
+#define ARM_COMPUTE_TEST_UNIT_CONTEXT_FIXTURE
+
+#include "arm_compute/Acl.hpp"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Test-case for AclDestroyContext
+ *
+ * Validate that AclDestroyContext behaves as expected when invalid inputs as context are given
+ *
+ * Test Steps:
+ *  - Call AclDestroyContext with null context
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyContext on empty array
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyContext on an ACL object other than AclContext
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that context is still nullptr
+ */
+template <AclTarget Target>
+class DestroyInvalidContextFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        AclContext ctx = nullptr;
+        std::array<char, 256> empty_array{};
+        AclContext valid_ctx = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateContext(&valid_ctx, Target, nullptr) == AclStatus::AclSuccess);
+        ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyContext(reinterpret_cast<AclContext>(empty_array.data())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(ctx == nullptr);
+        ARM_COMPUTE_ASSERT(AclDestroyContext(valid_ctx) == AclStatus::AclSuccess);
+    };
+};
+
+/** Test-case for AclCreateContext and AclDestroyContext
+ *
+ * Validate that AclCreateContext can create and destroy a context through the C API
+ *
+ * Test Steps:
+ *  - Call AclCreateContext with valid target
+ *  - Confirm that context is not nullptr and error code is AclSuccess
+ *  - Destroy context
+ *  - Confirm that AclSuccess is reported
+ */
+template <AclTarget Target>
+class SimpleContextCApiFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        AclContext ctx = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateContext(&ctx, Target, nullptr) == AclStatus::AclSuccess);
+        ARM_COMPUTE_ASSERT(ctx != nullptr);
+        ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclSuccess);
+    };
+};
+
+/** Test-case for Context from the C++ interface
+ *
+ * Test Steps:
+ *  - Create a Context object
+ *  - Confirm that StatusCode::Success is reported
+ *  - Confirm that equality operator works
+ *  - Confirm that inequality operator works
+ */
+template <acl::Target Target>
+class SimpleContextCppApiFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode status = acl::StatusCode::Success;
+        acl::Context    ctx(Target, &status);
+        ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success);
+
+        auto ctx_eq = ctx;
+        ARM_COMPUTE_ASSERT(ctx_eq == ctx);
+
+        acl::Context ctx_ienq(Target, &status);
+        ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success);
+        ARM_COMPUTE_ASSERT(ctx_ienq != ctx);
+    };
+};
+
+/** Test-case for multiple contexts
+ *
+ * Validate that AclCreateContext can create/destroy multiple contexts with different options
+ *
+ * Test Steps:
+ *  - Call AclCreateContext with different targets
+ *  - Confirm that AclSuccess is reported
+ *  - Destroy all contexts
+ *  - Confirm that AclSuccess is reported
+ */
+template <AclTarget Target>
+class MultipleContextsFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        const unsigned int num_tests = 5;
+        std::array<AclContext, num_tests> ctxs{};
+        for(unsigned int i = 0; i < num_tests; ++i)
+        {
+            ARM_COMPUTE_ASSERT(AclCreateContext(&ctxs[i], Target, nullptr) == AclStatus::AclSuccess);
+            ARM_COMPUTE_ASSERT(ctxs[i] != nullptr);
+            ARM_COMPUTE_ASSERT(AclDestroyContext(ctxs[i]) == AclStatus::AclSuccess);
+        }
+    };
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_UNIT_CONTEXT_FIXTURE */
diff --git a/tests/validation/fixtures/UNIT/DynamicTensorFixture.h b/tests/validation/fixtures/UNIT/DynamicTensorFixture.h
index c3aa63b31b..3e96dcbf2d 100644
--- a/tests/validation/fixtures/UNIT/DynamicTensorFixture.h
+++ b/tests/validation/fixtures/UNIT/DynamicTensorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,14 +74,14 @@ public:
 
     void validate(bool validate_finalized) const
     {
-        ARM_COMPUTE_EXPECT(mm->pool_manager() != nullptr, framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mm->lifetime_manager() != nullptr, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(mm->pool_manager() != nullptr);
+        ARM_COMPUTE_ASSERT(mm->lifetime_manager() != nullptr);
 
         if(validate_finalized)
         {
-            ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(mm->lifetime_manager()->are_all_finalized());
         }
-        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == num_pools, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(mm->pool_manager()->num_pools() == num_pools);
     }
 
     AllocatorType                    allocator;
@@ -127,7 +127,6 @@ class DynamicTensorType3SingleFunction : public framework::Fixture
     using T = float;
 
 public:
-    template <typename...>
     void setup(TensorShape input_level0, TensorShape input_level1)
     {
         input_l0 = input_level0;
@@ -159,15 +158,15 @@ protected:
         SimpleFunctionWrapperType layer(serv_internal.mm);
         layer.configure(&src, &dst);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Populate and validate memory manager
         serv_cross.populate(num_pools);
@@ -251,7 +250,6 @@ class DynamicTensorType3ComplexFunction : public framework::Fixture
     using T = float;
 
 public:
-    template <typename...>
     void setup(std::vector<TensorShape> input_shapes, TensorShape weights_shape, TensorShape bias_shape, std::vector<TensorShape> output_shapes, PadStrideInfo info)
     {
         num_iterations = input_shapes.size();
@@ -313,8 +311,8 @@ protected:
         // Create and configure function
         _f_target->configure(&src, &_weights_target, &_bias_target, &dst, info, weights_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -322,8 +320,8 @@ protected:
         _weights_target.allocator()->allocate();
         _bias_target.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
@@ -390,7 +388,6 @@ class DynamicTensorType2PipelineFunction : public framework::Fixture
     using T = float;
 
 public:
-    template <typename...>
     void setup(std::vector<TensorShape> input_shapes)
     {
         _data_type    = DataType::F32;
diff --git a/tests/validation/fixtures/UNIT/MemoryManagerFixture.h b/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
index 14f22a8d21..3bc4844bd5 100644
--- a/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
+++ b/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,8 +100,8 @@ protected:
 
         // Finalize memory manager
         mm->populate(_allocator, 1 /* num_pools */);
-        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == 1, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(mm->lifetime_manager()->are_all_finalized());
+        ARM_COMPUTE_ASSERT(mm->pool_manager()->num_pools() == 1);
 
         // Fill tensors
         fill(AccessorType(src), 0);
@@ -206,8 +206,8 @@ protected:
 
         // Finalize memory manager
         mm->populate(_allocator, 1 /* num_pools */);
-        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == 1, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(mm->lifetime_manager()->are_all_finalized());
+        ARM_COMPUTE_ASSERT(mm->pool_manager()->num_pools() == 1);
 
         // Fill tensors (1st iteration)
         fill(AccessorType(src), 0);
@@ -340,8 +340,8 @@ protected:
 
         // Finalize memory manager
         mm->populate(_allocator, 1 /* num_pools */);
-        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == 1, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(mm->lifetime_manager()->are_all_finalized());
+        ARM_COMPUTE_ASSERT(mm->pool_manager()->num_pools() == 1);
 
         // Fill tensors (1st iteration)
         fill(AccessorType(src), 0);
diff --git a/tests/validation/fixtures/UNIT/QueueFixture.h b/tests/validation/fixtures/UNIT/QueueFixture.h
new file mode 100644
index 0000000000..bc93f5f120
--- /dev/null
+++ b/tests/validation/fixtures/UNIT/QueueFixture.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_UNIT_QUEUE_FIXTURE
+#define ARM_COMPUTE_TEST_UNIT_QUEUE_FIXTURE
+
+#include "arm_compute/Acl.hpp"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Test case for AclCreateQueue
+ *
+ * Validate that AclCreateQueue rejects a null context
+ *
+ * Test Steps:
+ *  - Call AclCreateQueue with nullptr as both the context and the options
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that the queue is still nullptr
+ */
+class CreateQueueWithInvalidContextFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        AclQueue queue = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateQueue(&queue, nullptr, nullptr) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(queue == nullptr);
+    };
+};
+
+/** Test-case for AclCreateQueue
+ *
+ * Validate that AclCreateQueue behaves as expected with invalid options
+ *
+ * Test Steps:
+ *  - Call AclCreateQueue with valid context but an out-of-range tuning mode in the options
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that queue is still nullptr
+ */
+template <acl::Target Target>
+class CreateQueuerWithInvalidOptionsFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context ctx(Target);
+
+        // Value-initialise the options so the tuning mode is the only invalid field and nothing is indeterminate
+        AclQueueOptions invalid_queue_opts{};
+        invalid_queue_opts.mode = static_cast<AclTuningMode>(-1);
+
+        AclQueue queue = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateQueue(&queue, ctx.get(), &invalid_queue_opts) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(queue == nullptr);
+    };
+};
+
+/** Test case for AclDestroyQueue
+ *
+ * Validate that AclDestroyQueue behaves as expected when an invalid queue is given
+ *
+ * Test Steps:
+ *  - Call AclDestroyQueue with null queue
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyQueue on an ACL object other than AclQueue (a context handle)
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyQueue on a zero-initialised byte array
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that queue is still nullptr
+ */
+template <acl::Target Target>
+class DestroyInvalidQueueFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context ctx(Target);
+
+        std::array<char, 256> empty_array{};
+        AclQueue queue = nullptr;
+
+        ARM_COMPUTE_ASSERT(AclDestroyQueue(queue) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyQueue(reinterpret_cast<AclQueue>(ctx.get())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyQueue(reinterpret_cast<AclQueue>(empty_array.data())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(queue == nullptr);
+    };
+};
+
+/** Test case for AclCreateQueue
+ *
+ * Validate that a queue can be created successfully through the acl::Queue wrapper
+ *
+ * Test Steps:
+ *  - Create a context for the target and confirm that Success is reported
+ *  - Create a queue on that context
+ *  - Confirm that Success is reported
+ */
+template <acl::Target Target>
+class SimpleQueueFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        acl::Queue queue(ctx, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+    };
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_UNIT_QUEUE_FIXTURE */
diff --git a/tests/validation/fixtures/UNIT/TensorFixture.h b/tests/validation/fixtures/UNIT/TensorFixture.h
new file mode 100644
index 0000000000..bfe115b3ed
--- /dev/null
+++ b/tests/validation/fixtures/UNIT/TensorFixture.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_UNIT_TENSOR_FIXTURE
+#define ARM_COMPUTE_TEST_UNIT_TENSOR_FIXTURE
+
+#include "arm_compute/Acl.hpp"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Test case for AclCreateTensor
+ *
+ * Validate that AclCreateTensor rejects a null context
+ *
+ * Test Steps:
+ *  - Call AclCreateTensor with nullptr as both the context and the descriptor
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that the tensor is still nullptr
+ */
+class CreateTensorWithInvalidContextFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        AclTensor tensor = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, nullptr, nullptr, false) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(tensor == nullptr);
+    };
+};
+
+/** Test-case for AclCreateTensor
+ *
+ * Validate that AclCreateTensor behaves as expected on invalid descriptor
+ *
+ * Test Steps:
+ *  - Call AclCreateTensor with valid context but a null, bad-data-type or too-many-dimensions descriptor
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that tensor is still nullptr
+ */
+template <acl::Target Target>
+class CreateTensorWithInvalidDescriptorFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context ctx(Target);
+        AclTensor    tensor = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), nullptr, false) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(tensor == nullptr);
+
+        // Check invalid data type (descriptor is value-initialised so no field is left indeterminate)
+        AclTensorDescriptor invalid_desc{};
+        invalid_desc.ndims     = 4;
+        invalid_desc.data_type = static_cast<AclDataType>(-1);
+        ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), &invalid_desc, false) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(tensor == nullptr);
+
+        // Check invalid number of dimensions
+        invalid_desc.data_type = AclDataType::AclFloat32;
+        invalid_desc.ndims     = 15;
+        ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), &invalid_desc, false) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(tensor == nullptr);
+    };
+};
+
+/** Test case for AclDestroyTensor
+ *
+ * Validate that AclDestroyTensor behaves as expected when an invalid tensor is given
+ *
+ * Test Steps:
+ *  - Call AclDestroyTensor with null tensor
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyTensor on an ACL object other than AclTensor (a context handle)
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyTensor on a zero-initialised byte array
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that tensor is still nullptr
+ */
+template <acl::Target Target>
+class DestroyInvalidTensorFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context ctx(Target);
+
+        std::array<char, 256> empty_array{};
+        AclTensor tensor = nullptr;
+
+        ARM_COMPUTE_ASSERT(AclDestroyTensor(tensor) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyTensor(reinterpret_cast<AclTensor>(ctx.get())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyTensor(reinterpret_cast<AclTensor>(empty_array.data())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(tensor == nullptr);
+    };
+};
+
+/** Test case for AclCreateTensor
+ *
+ * Validate that a tensor can be created successfully through the acl::Tensor wrapper
+ *
+ * Test Steps:
+ *  - Create a context for the target and confirm that Success is reported
+ *  - Create a 2x3 Float32 tensor on that context
+ *  - Confirm that Success is reported
+ */
+template <acl::Target Target>
+class SimpleTensorFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+        acl::Context    ctx(Target, &err);
+
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+    };
+};
+
+/** Test case for AclTensor
+ *
+ * Validate that multiple tensors can be created successfully
+ * Stress the possibility of memory leaks
+ *
+ * Test Steps:
+ *  - Create a valid context
+ *  - Create 1024 tensors (1024x1024, Float32) one at a time, each scoped to its own loop iteration
+ *  - Confirm that Success is reported for every creation
+ */
+template <acl::Target Target>
+class TensorStressFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        const unsigned int num_tensors = 1024;
+        for(unsigned int i = 0; i < num_tensors; ++i)
+        {
+            acl::Tensor tensor(ctx, acl::TensorDescriptor({ 1024, 1024 }, acl::DataType::Float32), &err);
+            ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        }
+    };
+};
+
+/** Test case for AclMapTensor
+ *
+ * Validate that map on an invalid object fails
+ *
+ * Test Steps:
+ *  - Create a valid context
+ *  - Pass an invalid object (the context handle cast to AclTensor) for mapping
+ *  - Confirm that AclInvalidArgument is returned
+ */
+template <acl::Target Target>
+class MapInvalidTensorFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        void *handle = nullptr;
+        ARM_COMPUTE_ASSERT(AclMapTensor(reinterpret_cast<AclTensor>(ctx.get()), &handle) == AclStatus::AclInvalidArgument);
+    };
+};
+
+/** Test case for AclMapTensor
+ *
+ * Validate that mapping an unallocated tensor yields nullptr
+ *
+ * Test Steps:
+ *  - Create a valid context
+ *  - Create a valid 8x8 Float32 tensor without allocating
+ *  - Map tensor
+ *  - Check that mapping is nullptr
+ */
+template <acl::Target Target>
+class MapNotAllocatedTensorFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        acl::Tensor tensor(ctx, acl::TensorDescriptor({ 8, 8 }, acl::DataType::Float32), false /* allocate */, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        ARM_COMPUTE_ASSERT(tensor.map() == nullptr);
+    };
+};
+
+/** Test case for AclMapTensor
+ *
+ * Validate that mapping a valid tensor returns a non-nullptr value
+ *
+ * Test Steps:
+ *  - Create a valid context
+ *  - Create a valid tensor while allocating
+ *  - Map tensor
+ *  - Check that mapping is not nullptr and that unmapping it reports Success
+ */
+template <acl::Target Target>
+class MapAllocatedTensorFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        acl::Tensor tensor(ctx, acl::TensorDescriptor({ 8, 8 }, acl::DataType::Float32), &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        void *handle = tensor.map();
+        ARM_COMPUTE_ASSERT(handle != nullptr);
+        ARM_COMPUTE_ASSERT(tensor.unmap(handle) == acl::StatusCode::Success);
+    };
+};
+
+/** Test case for AclTensorImport
+ *
+ * Validate that externally allocated memory can be successfully imported
+ *
+ * Test Steps:
+ *  - Create a valid context
+ *  - Create a valid tensor without allocating
+ *  - Allocate external memory
+ *  - Import memory to the tensor and confirm that Success is reported
+ *  - Check that the mapped pointer matches the imported one
+ */
+template <acl::Target Target>
+class ImportMemoryFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        const int32_t size = 8;
+        acl::Tensor   tensor(ctx, acl::TensorDescriptor({ size }, acl::DataType::Float32), false /* allocate */, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        std::vector<float> data(size);
+        ARM_COMPUTE_ASSERT(tensor.import(data.data(), acl::ImportType::Host) == acl::StatusCode::Success);
+
+        void *handle = tensor.map();
+        ARM_COMPUTE_ASSERT(handle == data.data());
+        ARM_COMPUTE_ASSERT(tensor.unmap(handle) == acl::StatusCode::Success);
+    }
+};
+/** Test case for get_size() interface of Tensor
+ *
+ * Test Steps:
+ *  - Create a valid context
+ *  - Create a valid tensor
+ *  - Compare the size value returned with the expected value
+ */
+template <acl::Target Target>
+class TensorSizeFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::StatusCode err = acl::StatusCode::Success;
+        acl::Context    ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        // The size check below is only meaningful if the tensor was actually created
+        acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        // size should be 6 elements (2x3) times 4 bytes (float32) = 24 bytes
+        constexpr size_t expected_size = 24;
+        ARM_COMPUTE_ASSERT(tensor.get_size() == expected_size);
+    };
+};
+/** Test case for get_size() dealing with invalid arguments
+ *
+ * Test Steps:
+ *  - Test nullptr tensor can return a correct error
+ *  - Create a valid tensor and confirm that its creation succeeded
+ *  - Test C interface with null size argument can return a correct error
+ */
+template <acl::Target Target>
+class InvalidTensorSizeFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        // Null tensor
+        AclTensor null_tensor = nullptr;
+        uint64_t  size{ 0 };
+        ARM_COMPUTE_ASSERT(AclGetTensorSize(null_tensor, &size) == AclStatus::AclInvalidArgument);
+
+        // Create valid tensor
+        acl::StatusCode err = acl::StatusCode::Success;
+        acl::Context    ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        // Null size argument (the tensor is known to be valid, so only the size pointer is at fault)
+        ARM_COMPUTE_ASSERT(AclGetTensorSize(tensor.get(), nullptr) == AclStatus::AclInvalidArgument);
+    };
+};
+
+template <acl::Target Target>
+class DescriptorConversionFixture : public framework::Fixture
+{
+    /** Compare ndims, data type and shape of two C descriptors; a missing shape array compares as different */
+    bool compare_descriptor(const AclTensorDescriptor &desc_a, const AclTensorDescriptor &desc_b)
+    {
+        // Bail out before indexing: a null shape or differing ndims would make the loop below read invalid memory
+        const bool has_shapes = desc_a.shape != nullptr && desc_b.shape != nullptr;
+        if(!has_shapes || desc_a.ndims != desc_b.ndims || desc_a.data_type != desc_b.data_type)
+        {
+            return false;
+        }
+        auto are_descriptors_same = true;
+        for(int32_t d = 0; d < desc_a.ndims; ++d)
+        {
+            are_descriptors_same &= desc_a.shape[d] == desc_b.shape[d];
+        }
+        // other attributes should be added here
+        return are_descriptors_same;
+    }
+
+public:
+    void setup()
+    {
+        auto err{ acl::StatusCode::Success };
+        auto ctx{ acl::Context(Target, &err) };
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        auto        desc{ acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32) };
+        acl::Tensor tensor(ctx, desc, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        auto desc_from_tensor = tensor.get_descriptor();
+        ARM_COMPUTE_ASSERT(compare_descriptor(*desc.get(), *desc_from_tensor.get()));
+        ARM_COMPUTE_ASSERT(desc == desc_from_tensor);
+
+        // Test C interface with a "prepopulated" descriptor
+        // Note: When the C interface is used, there is a possibility of a memory leak
+        // if members are not correctly deleted (e.g., shape).
+        // Since that is considered the user's responsibility, we don't test it here.
+        AclTensorDescriptor prepopulated_descriptor
+        {
+            3, nullptr, AclDataType::AclBFloat16, nullptr, 0
+        };
+
+        ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(tensor.get(), &prepopulated_descriptor) == AclStatus::AclSuccess);
+        ARM_COMPUTE_ASSERT(compare_descriptor(*desc.get(), prepopulated_descriptor));
+        ARM_COMPUTE_ASSERT(desc == acl::TensorDescriptor(prepopulated_descriptor));
+    };
+};
+
+template <acl::Target Target>
+class InvalidDescriptorConversionFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        // Null tensor: AclGetTensorDescriptor must report AclInvalidArgument
+        AclTensor           null_tensor = nullptr;
+        AclTensorDescriptor desc{};
+        ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(null_tensor, &desc) == AclStatus::AclInvalidArgument);
+
+        // Create valid tensor
+        acl::StatusCode err = acl::StatusCode::Success;
+        acl::Context    ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+        // Null descriptor argument (the tensor is known to be valid, so only the descriptor pointer is at fault)
+        ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(tensor.get(), nullptr) == AclStatus::AclInvalidArgument);
+    };
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_UNIT_TENSOR_FIXTURE */
diff --git a/tests/validation/fixtures/UNIT/TensorPackFixture.h b/tests/validation/fixtures/UNIT/TensorPackFixture.h
new file mode 100644
index 0000000000..bc14631936
--- /dev/null
+++ b/tests/validation/fixtures/UNIT/TensorPackFixture.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_UNIT_TENSORPACK_FIXTURE
+#define ARM_COMPUTE_TEST_UNIT_TENSORPACK_FIXTURE
+
+#include "arm_compute/Acl.hpp"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Test case for AclCreateTensorPack
+ *
+ * Validate that AclCreateTensorPack rejects a null context
+ *
+ * Test Steps:
+ *  - Call AclCreateTensorPack with nullptr as the context
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that the tensor pack is still nullptr
+ */
+class CreateTensorPackWithInvalidContextFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        AclTensorPack pack = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateTensorPack(&pack, nullptr) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(pack == nullptr);
+    };
+};
+
+/** Test case for AclDestroyTensorPack
+ *
+ * Validate that AclDestroyTensorPack behaves as expected when an invalid tensor pack is given
+ *
+ * Test Steps:
+ *  - Call AclDestroyTensorPack with null tensor pack
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyTensorPack on an ACL object other than AclTensorPack (a context handle)
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Call AclDestroyTensorPack on a zero-initialised byte array
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Confirm that tensor pack is still nullptr
+ */
+template <acl::Target Target>
+class DestroyInvalidTensorPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context ctx(Target);
+
+        std::array<char, 256> empty_array{};
+        AclTensorPack pack = nullptr;
+
+        ARM_COMPUTE_ASSERT(AclDestroyTensorPack(pack) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyTensorPack(reinterpret_cast<AclTensorPack>(ctx.get())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyTensorPack(reinterpret_cast<AclTensorPack>(empty_array.data())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(pack == nullptr);
+    };
+};
+
+/** Test case for AclPackTensor
+ *
+ * Validate that AclPackTensor behaves as expected when an invalid object is passed for packing
+ *
+ * Test Steps:
+ *  - Create a valid TensorPack
+ *  - Try to pack an API object other than a tensor (the context handle)
+ *  - Confirm that AclInvalidArgument is reported
+ *  - Try to pack a null tensor
+ *  - Confirm that AclInvalidArgument is reported
+ */
+template <acl::Target Target>
+class AddInvalidObjectToTensorPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        auto err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        acl::TensorPack pack(ctx, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        auto status = AclPackTensor(pack.get(),
+                                    reinterpret_cast<AclTensor>(ctx.get()),
+                                    AclTensorSlot::AclSrc);
+        ARM_COMPUTE_ASSERT(status == AclInvalidArgument);
+
+        status = AclPackTensor(pack.get(), nullptr, AclTensorSlot::AclSrc);
+        ARM_COMPUTE_ASSERT(status == AclInvalidArgument);
+    };
+};
+
+/** Test case for AclPackTensor
+ *
+ * Validate that a tensor can be added successfully to the TensorPack
+ *
+ * Test Steps:
+ *  - Create a context and a tensor pack on it
+ *  - Create a 3x3x5x7 Float32 tensor
+ *  - Add tensor to the tensor pack in the AclSrc slot
+ *  - Confirm that Success is returned
+ */
+template <acl::Target Target>
+class SimpleTensorPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context    ctx(Target);
+        acl::TensorPack pack(ctx);
+        acl::Tensor     t(ctx, acl::TensorDescriptor({ 3, 3, 5, 7 }, acl::DataType::Float32));
+
+        ARM_COMPUTE_ASSERT(pack.add(t, AclTensorSlot::AclSrc) == acl::StatusCode::Success);
+    };
+};
+
+/** Test case for AclPackTensor
+ *
+ * Validate that multiple tensors can be added successfully to the TensorPack
+ *
+ * Test Steps:
+ *  - Create a valid tensor pack
+ *  - Create 256 tensors from one shared descriptor, confirming Success for each
+ *  - Add each tensor to its own slot, counting up from AclSrcVec
+ *  - Confirm that Success is returned for every addition
+ */
+template <acl::Target Target>
+class MultipleTensorsInPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context    ctx(Target);
+        acl::TensorPack pack(ctx);
+
+        const acl::TensorDescriptor desc(acl::TensorDescriptor({ 3, 3, 5, 7 }, acl::DataType::Float32));
+        const size_t                num_tensors = 256;
+
+        std::vector<acl::Tensor> tensors;
+        for(unsigned int i = 0; i < num_tensors; ++i)
+        {
+            auto err = acl::StatusCode::Success;
+            tensors.emplace_back(acl::Tensor(ctx, desc, &err));
+            ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+            ARM_COMPUTE_ASSERT(pack.add(tensors.back(), static_cast<int32_t>(AclTensorSlot::AclSrcVec) + i) == acl::StatusCode::Success);
+        }
+    };
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_UNIT_TENSORPACK_FIXTURE */
diff --git a/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h b/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h
index af9f776ebc..f5e6071340 100644
--- a/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h
+++ b/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h
@@ -74,10 +74,10 @@ protected:
     TensorType compute_target()
     {
         // Create tensors
-        TensorType w1  = create_tensor<TensorType>(TensorShape(180000U, 150U), DataType::F32, 1);
-        TensorType b1  = create_tensor<TensorType>(TensorShape(150U), DataType::F32, 1);
-        TensorType src = create_tensor<TensorType>(TensorShape(1U, 150U, 1200U, _max_batches), DataType::F32, 1);
-        TensorType dst = create_tensor<TensorType>(TensorShape(150U, _max_batches), DataType::F32, 1);
+        TensorType w1  = create_tensor<TensorType>(TensorShape(6000U, 15U), DataType::F32, 1);
+        TensorType b1  = create_tensor<TensorType>(TensorShape(15U), DataType::F32, 1);
+        TensorType src = create_tensor<TensorType>(TensorShape(1U, 15U, 400U, _max_batches), DataType::F32, 1);
+        TensorType dst = create_tensor<TensorType>(TensorShape(15U, _max_batches), DataType::F32, 1);
 
         // Create and configure function
         FullyConnectedFunction fc_layer_1;
@@ -105,9 +105,9 @@ protected:
         int  diff            = _max_batches - _cur_batches;
         auto new_src_padding = PaddingSize(src_padding.top, src_padding.right, src_padding.bottom + diff, src_padding.left);
         auto new_dst_padding = PaddingSize(dst_padding.top, dst_padding.right, dst_padding.bottom + diff, dst_padding.left);
-        src.allocator()->info().set_tensor_shape(TensorShape(1U, 150U, 1200U, _cur_batches)).set_is_resizable(true).extend_padding(new_src_padding);
+        src.allocator()->info().set_tensor_shape(TensorShape(1U, 15U, 400U, _cur_batches)).set_is_resizable(true).extend_padding(new_src_padding);
         src.allocator()->info().set_is_resizable(false);
-        dst.allocator()->info().set_tensor_shape(TensorShape(150U, _cur_batches)).set_is_resizable(true).extend_padding(new_dst_padding);
+        dst.allocator()->info().set_tensor_shape(TensorShape(15U, _cur_batches)).set_is_resizable(true).extend_padding(new_dst_padding);
         dst.allocator()->info().set_is_resizable(false);
 
         // Configure FC info
@@ -129,16 +129,16 @@ protected:
     SimpleTensor<T> compute_reference()
     {
         // Create reference
-        SimpleTensor<T> w1{ TensorShape(180000U, 150U), DataType::F32 };
-        SimpleTensor<T> b1{ TensorShape(150U), DataType::F32 };
-        SimpleTensor<T> src{ TensorShape(1U, 150U, 1200U, _cur_batches), DataType::F32 };
+        SimpleTensor<T> w1{ TensorShape(6000U, 15U), DataType::F32 };
+        SimpleTensor<T> b1{ TensorShape(15U), DataType::F32 };
+        SimpleTensor<T> src{ TensorShape(1U, 15U, 400U, _cur_batches), DataType::F32 };
 
         // Fill reference
         fill(src, 5);
         fill(w1, 1);
         fill(b1, 2);
 
-        return reference::fully_connected_layer(src, w1, b1, TensorShape(150U, _cur_batches));
+        return reference::fully_connected_layer(src, w1, b1, TensorShape(15U, _cur_batches));
     }
 
 protected:
diff --git a/tests/validation/fixtures/UnstackFixture.h b/tests/validation/fixtures/UnstackFixture.h
index 53c79e180b..30b7dd5539 100644
--- a/tests/validation/fixtures/UnstackFixture.h
+++ b/tests/validation/fixtures/UnstackFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,6 @@ template <typename TensorType, typename ITensorType, typename AccessorType, type
 class UnstackValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, int axis, int num, DataType data_type)
     {
         _target    = compute_target(input_shape, axis, num, data_type);
@@ -80,10 +79,10 @@ protected:
         for(auto &out : output_slices)
         {
             out.allocator()->allocate();
-            ARM_COMPUTE_EXPECT(!out.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!out.info()->is_resizable());
         }
         input_tensor.allocator()->allocate();
-        ARM_COMPUTE_EXPECT(!input_tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!input_tensor.info()->is_resizable());
         fill(AccessorType(input_tensor), 0);
         // Compute function
         unstack.run();
diff --git a/tests/validation/fixtures/WeightsReshapeFixture.h b/tests/validation/fixtures/WeightsReshapeFixture.h
index 5c17b538d5..68bd8b689d 100644
--- a/tests/validation/fixtures/WeightsReshapeFixture.h
+++ b/tests/validation/fixtures/WeightsReshapeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,10 +45,9 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WeightsReshapeValidationFixture : public framework::Fixture
+class WeightsReshapeOpValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, DataType data_type, bool has_bias, unsigned int num_groups)
     {
         const TensorShape output_shape = compute_weights_reshaped_shape(TensorInfo(input_shape, 1, data_type), has_bias, num_groups);
@@ -73,34 +72,44 @@ protected:
 
         // Create and configure function
         FunctionType weights_reshape_func;
-        weights_reshape_func.configure(&src, (has_bias ? &bias : nullptr), &dst, num_groups);
+        weights_reshape_func.configure(src.info(), (has_bias ? bias.info() : nullptr), dst.info(), num_groups);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0);
 
         if(has_bias)
         {
-            ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
 
             bias.allocator()->allocate();
 
-            ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
 
             fill(AccessorType(bias), 1);
         }
 
+        arm_compute::ITensorPack pack =
+        {
+            { arm_compute::TensorType::ACL_SRC, &src },
+            { arm_compute::TensorType::ACL_DST, &dst }
+        };
+
+        if(has_bias)
+        {
+            pack.add_const_tensor(arm_compute::TensorType::ACL_BIAS, &bias);
+        }
         // Compute function
-        weights_reshape_func.run();
+        weights_reshape_func.run(pack);
 
         return dst;
     }
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
index 03ec920c4e..20b678b36c 100644
--- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -51,130 +51,36 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool use_bias = true>
-class WinogradConvolutionLayerValidationFixture : public framework::Fixture
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename T1 = T, bool use_bias = true, bool mixed_layout = false>
+class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
-               DataType data_type, ActivationLayerInfo act_info)
+               DataType data_type, ActivationLayerInfo act_info, const DataLayout &data_layout)
+
     {
         ARM_COMPUTE_UNUSED(dilation);
-
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+        _mixed_layout = mixed_layout;
+        _target       = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info, data_layout);
+        _reference    = compute_reference(input_shape, weights_shape, bias_shape, info, data_type, act_info);
     }
 
 protected:
-    template <typename U>
-    void fill(U &&tensor, int i, float min, float max)
-    {
-        switch(tensor.data_type())
-        {
-            case DataType::F16:
-            {
-                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ float(min), float(max) };
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            case DataType::F32:
-            {
-                std::uniform_real_distribution<float> distribution(min, max);
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Not supported");
-            }
-        }
-    }
-
-    TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, const PadStrideInfo &info,
-                              DataType data_type, ActivationLayerInfo act_info)
-    {
-        // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, data_type, 1);
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1);
-
-        // Create and configure function
-        FunctionType conv;
-        ARM_COMPUTE_EXPECT(static_cast<bool>(conv.validate(src.info(), weights.info(), (use_bias) ? bias.info() : nullptr, dst.info(), info, act_info)), framework::LogLevel::ERRORS);
-        conv.configure(&src, &weights, (use_bias) ? &bias : nullptr, &dst, info, act_info);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        weights.allocator()->allocate();
-        dst.allocator()->allocate();
-        bias.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src), 0, -1.f, 1.f);
-        fill(AccessorType(weights), 1, -1.f, 1.f);
-        fill(AccessorType(bias), 2, -1.f, 1.f);
-
-        // Compute Winograd Convolution function
-        conv.run();
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                                      DataType data_type, ActivationLayerInfo act_info)
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
     {
-        // Create reference
-        SimpleTensor<T> src{ input_shape, data_type, 1 };
-        SimpleTensor<T> weights{ weights_shape, data_type, 1 };
-        SimpleTensor<T> bias{ bias_shape, data_type, 1 };
+        const DataLayout data_layout = src.info()->data_layout();
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
 
-        // Fill reference
-        fill(src, 0, -1.f, 1.f);
-        fill(weights, 1, -1.f, 1.f);
-        if(use_bias)
-        {
-            fill(bias, 2, -1.f, 1.f);
-        }
-        else
-        {
-            fill(bias, 2, 0.f, 0.f);
-        }
-
-        SimpleTensor<T> conv_out = reference::convolution_layer<T>(src, weights, bias, output_shape, info);
+        // Compute Convolution function
+        layer.run();
 
-        return (act_info.enabled()) ? reference::activation_layer<T>(conv_out, act_info) : conv_out;
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout);
+        dst.info()->set_data_layout(data_layout);
     }
 
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename T1 = T, bool use_bias = true>
-class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
-               DataType data_type, ActivationLayerInfo act_info, const DataLayout &data_layout)
-
-    {
-        ARM_COMPUTE_UNUSED(dilation);
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info, data_layout);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, info, data_type, act_info);
-    }
-
-protected:
     template <typename U>
     void fill(U &&tensor, int i, float min, float max)
     {
@@ -221,10 +127,12 @@ protected:
                            framework::LogLevel::ERRORS);
         conv.configure(&src, &weights, (use_bias) ? &bias : nullptr, &dst, info, act_info, true /* Enable fast math */);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &weights, &bias, &dst }, data_layout);
 
         // Allocate tensors
         src.allocator()->allocate();
@@ -232,19 +140,25 @@ protected:
         dst.allocator()->allocate();
         bias.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0, -0.5f, 0.5f);
         fill(AccessorType(weights), 1, -0.5f, 0.5f);
         fill(AccessorType(bias), 2, -0.5f, 0.5f);
 
-        // Compute Winograd Convolution function
-        conv.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(conv, src, dst);
+        }
+        else
+        {
+            // Compute function
+            conv.run();
+        }
         return dst;
     }
 
@@ -315,28 +229,45 @@ protected:
         SimpleTensor<T1> filter_transform_out = reference::winograd_filter_transform<T1>(weights_t1, filter_transform_shape, winograd_info);
         SimpleTensor<T1> batched_gemm         = reference::gemm<T1>(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f);
         SimpleTensor<T1> conv_out             = reference::winograd_output_transform<T1>(batched_gemm, bias_t1, output_transform_shape, winograd_info);
-        SimpleTensor<T>  conv_out_t(std::move(copy_tensor<T, T1>(conv_out)));
+        SimpleTensor<T>  conv_out_t(copy_tensor<T, T1>(conv_out));
         return (act_info.enabled()) ? reference::activation_layer<T>(conv_out_t, act_info) : conv_out_t;
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
+    bool            _mixed_layout{ false };
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class WinogradInputTransformValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, WinogradInfo winograd_info, DataLayout data_layout, DataType data_type)
     {
         TensorShape output_shape = compute_winograd_input_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
-
-        _target    = compute_target(input_shape, output_shape, winograd_info, data_layout, data_type);
-        _reference = compute_reference(input_shape, output_shape, winograd_info, data_type);
+        _mixed_layout            = mixed_layout;
+        _target                  = compute_target(input_shape, output_shape, winograd_info, data_layout, data_type);
+        _reference               = compute_reference(input_shape, output_shape, winograd_info, data_type);
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        const DataLayout data_layout_src = src.info()->data_layout();
+        const DataLayout data_layout_dst = dst.info()->data_layout();
+
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout_src == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout_dst == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Convolution function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout_src);
+        dst.info()->set_data_layout(data_layout_dst);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i, float min, float max)
     {
@@ -375,22 +306,30 @@ protected:
         FunctionType transf;
         transf.configure(&src, &dst, winograd_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &dst }, data_layout);
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0, -1.f, 1.f);
 
-        // Compute Winograd input transform function
-        transf.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(transf, src, dst);
+        }
+        else
+        {
+            // Compute Winograd input transform function
+            transf.run();
+        }
         return dst;
     }
 
@@ -405,25 +344,43 @@ protected:
         return reference::winograd_input_transform<T>(src, output_shape, winograd_info);
     }
 
+    bool            _mixed_layout{ false };
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class WinogradFilterTransformValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, Size2D output_tile, DataLayout data_layout, DataType data_type)
     {
         WinogradInfo winograd_info(output_tile, Size2D(input_shape[0], input_shape[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW /* Not needed */);
         TensorShape  output_shape = compute_winograd_filter_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
 
-        _target    = compute_target(input_shape, output_shape, winograd_info, data_layout, data_type);
-        _reference = compute_reference(input_shape, output_shape, winograd_info, data_type);
+        _mixed_layout = mixed_layout;
+        _target       = compute_target(input_shape, output_shape, winograd_info, data_layout, data_type);
+        _reference    = compute_reference(input_shape, output_shape, winograd_info, data_type);
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        const DataLayout data_layout_src = src.info()->data_layout();
+        const DataLayout data_layout_dst = dst.info()->data_layout();
+
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout_src == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout_dst == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Convolution function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout_src);
+        dst.info()->set_data_layout(data_layout_dst);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i, float min, float max)
     {
@@ -463,21 +420,30 @@ protected:
         FunctionType filter_transform;
         filter_transform.configure(&src, &dst, winograd_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &dst }, data_layout);
 
         // Allocate tensors
         src.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0, -1.f, 1.f);
 
-        filter_transform.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(filter_transform, src, dst);
+        }
+        else
+        {
+            // Compute Winograd filter transform function
+            filter_transform.run();
+        }
         return dst;
     }
 
@@ -492,15 +458,15 @@ protected:
         return reference::winograd_filter_transform<T>(src, output_shape, winograd_info);
     }
 
+    bool            _mixed_layout{ false };
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
 class WinogradOutputTransformValidationFixture : public framework::Fixture
 {
 public:
-    template <typename...>
     void setup(TensorShape input_shape, WinogradInfo winograd_info, DataType data_type, ActivationLayerInfo act_info = ActivationLayerInfo())
     {
         _target    = compute_target(input_shape, winograd_info, data_type, act_info);
@@ -508,6 +474,23 @@ public:
     }
 
 protected:
+    void mix_layout(FunctionType &layer, TensorType &src, TensorType &dst)
+    {
+        const DataLayout data_layout_src = src.info()->data_layout();
+        const DataLayout data_layout_dst = dst.info()->data_layout();
+
+        // Test Multi DataLayout graph cases, when the data layout changes after configure
+        src.info()->set_data_layout(data_layout_src == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+        dst.info()->set_data_layout(data_layout_dst == DataLayout::NCHW ? DataLayout::NHWC : DataLayout::NCHW);
+
+        // Compute Convolution function
+        layer.run();
+
+        // Reinstating original data layout for the test suite to properly check the values
+        src.info()->set_data_layout(data_layout_src);
+        dst.info()->set_data_layout(data_layout_dst);
+    }
+
     template <typename U>
     void fill(U &&tensor, int i, float min, float max)
     {
@@ -545,25 +528,34 @@ protected:
         FunctionType output_transform;
         output_transform.configure(&src, &bias, &dst, winograd_info, act_info);
 
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        add_padding_x({ &src, &bias, &dst }, winograd_info.output_data_layout);
 
         // Allocate tensors
         src.allocator()->allocate();
         bias.allocator()->allocate();
         dst.allocator()->allocate();
 
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
         fill(AccessorType(src), 0, -1.f, 1.f);
         fill(AccessorType(bias), 1, -1.f, 1.f);
 
-        output_transform.run();
-
+        if(_mixed_layout)
+        {
+            mix_layout(output_transform, src, dst);
+        }
+        else
+        {
+            // Compute Winograd output transform function
+            output_transform.run();
+        }
         return dst;
     }
 
@@ -585,10 +577,11 @@ protected:
         return (act_info.enabled()) ? reference::activation_layer<T>(winograd_output, act_info) : winograd_output;
     }
 
+    bool            _mixed_layout{ false };
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h
new file mode 100644
index 0000000000..ca4de11a15
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DEPTHWISECONV2DFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DEPTHWISECONV2DFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/reference/DepthwiseConvolutionLayer.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuDepthwiseConv2dValidationGenericFixture : public framework::Fixture
+{
+public:
+    using TBias = typename std::conditional<std::is_same<typename std::decay<T>::type, uint8_t>::value ||
+                                                std::is_same<typename std::decay<T>::type, int8_t>::value,
+                                            int32_t,
+                                            T>::type; // If T: uint8_t or int8_t then TBias: int32_t, otherwise TBias: T
+
+    void setup(TensorShape          input_shape,
+               Size2D               kernel_size,
+               const PadStrideInfo &pad_stride,
+               const Size2D        &dilation,
+               const unsigned int   depth_multiplier,
+               const DataType       data_type,
+               const DataLayout     data_layout)
+    {
+        ARM_COMPUTE_ERROR_ON(data_layout !=
+                             DataLayout::NHWC); // Dynamic fusion depthwise conv2d only supports NHWC layout
+
+        DepthwiseConv2dAttributes dwc_conv2d_attr;
+        const Padding2D           padding_2d(pad_stride.pad_left(), pad_stride.pad_right(), pad_stride.pad_top(),
+                                             pad_stride.pad_bottom());
+        dwc_conv2d_attr.pad(padding_2d)
+            .stride(Size2D(pad_stride.stride().first, pad_stride.stride().second))
+            .dilation(dilation)
+            .depth_multiplier(depth_multiplier)
+            .dimension_rounding_type(pad_stride.round());
+
+        // Calculate Output and Weight Shapes
+        TensorShape weights_shape = TensorShape(kernel_size.width, kernel_size.height);
+
+        const TensorInfo in_info(input_shape, 1, data_type);
+        const TensorInfo we_info(weights_shape, 1, data_type);
+
+        const ConvolutionInfo info{pad_stride, depth_multiplier, ActivationLayerInfo(), dilation};
+        const TensorShape     output_shape =
+            misc::shape_calculator::compute_depthwise_convolution_shape(in_info, we_info, info);
+
+        weights_shape.set(2, output_shape.z());
+        const TensorShape bias_shape = TensorShape(weights_shape[2]);
+
+        _data_type   = data_type;
+        _data_layout = data_layout;
+        _target      = compute_target(input_shape, weights_shape, bias_shape, dwc_conv2d_attr);
+        _reference   = compute_reference(input_shape, weights_shape, bias_shape, output_shape, dwc_conv2d_attr);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch (tensor.data_type())
+        {
+            case DataType::F16:
+            {
+                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{-1.0f, 1.0f};
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    // Given input is in nchw format
+    TensorType compute_target(TensorShape                     input_shape,
+                              TensorShape                     weights_shape,
+                              const TensorShape              &bias_shape,
+                              const DepthwiseConv2dAttributes dwc_conv2d_attr)
+    {
+        ARM_COMPUTE_ERROR_ON(_data_layout != DataLayout::NHWC);
+
+        // Our test shapes are assumed in NCHW data layout, thus the permutation
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+        permute(weights_shape, PermutationVector(2U, 0U, 1U));
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors
+        ITensorInfo *input_info  = context.create_tensor_info(TensorInfo(input_shape, 1, _data_type, _data_layout));
+        ITensorInfo *weight_info = context.create_tensor_info(TensorInfo(weights_shape, 1, _data_type, _data_layout));
+        ITensorInfo *bias_info   = context.create_tensor_info(TensorInfo(bias_shape, 1, _data_type, _data_layout));
+        ITensorInfo *dst_info    = context.create_tensor_info();
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, input_info, weight_info, bias_info, dwc_conv2d_attr);
+        GpuOutput::create_op(sketch, ans_info, dst_info);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_input{};
+        TensorType t_weight{};
+        TensorType t_bias{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_input.allocator()->init(*input_info);
+        t_weight.allocator()->init(*weight_info);
+        t_bias.allocator()->init(*bias_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_input.allocator()->allocate();
+        t_weight.allocator()->allocate();
+        t_bias.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_input), 0);
+        fill(AccessorType(t_weight), 1);
+        fill(AccessorType(t_bias), 2);
+
+        // Run runtime
+        runtime.run({&t_input, &t_weight, &t_bias, &t_dst});
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape        &input_shape,
+                                      const TensorShape        &weights_shape,
+                                      const TensorShape        &bias_shape,
+                                      const TensorShape        &output_shape,
+                                      DepthwiseConv2dAttributes dwc_conv2d_attr)
+    {
+        // Create reference
+        SimpleTensor<T>     src{input_shape, _data_type, 1};
+        SimpleTensor<T>     weight{weights_shape, _data_type, 1};
+        SimpleTensor<TBias> bias{bias_shape, _data_type, 1};
+
+        fill(src, 0);
+        fill(weight, 1);
+        fill(bias, 2);
+
+        auto src_nchw          = src;
+        auto weights_nchw      = weight;
+        auto bias_nchw         = bias;
+        auto output_shape_nchw = output_shape;
+
+        PadStrideInfo legacy_pad_stride(dwc_conv2d_attr.stride().x(), dwc_conv2d_attr.stride().y(),
+                                        dwc_conv2d_attr.pad().left, dwc_conv2d_attr.pad().right,
+                                        dwc_conv2d_attr.pad().top, dwc_conv2d_attr.pad().bottom,
+                                        DimensionRoundingType{});
+        auto          dst_nchw =
+            reference::depthwise_convolution(src_nchw, weights_nchw, bias_nchw, output_shape_nchw, legacy_pad_stride,
+                                             dwc_conv2d_attr.depth_multiplier(), dwc_conv2d_attr.dilation());
+        return dst_nchw;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    DataType        _data_type{};
+    DataLayout      _data_layout{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuDepthwiseConv2dValidationFixture
+    : public DynamicFusionGpuDepthwiseConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape          input_shape,
+               Size2D               kernel_size,
+               const PadStrideInfo &info,
+               const Size2D        &dilation,
+               const unsigned int   depth_multiplier,
+               DataType             data_type,
+               DataLayout           data_layout)
+    {
+        DynamicFusionGpuDepthwiseConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            input_shape, kernel_size, info, dilation, depth_multiplier, data_type, data_layout);
+    }
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DEPTHWISECONV2DFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h
new file mode 100644
index 0000000000..1f4e223b93
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+// Fills `tensor` through the assets library, choosing the distribution from its data type; `i` is passed through.
+template <typename U>
+void fill(U &&tensor, int i)
+{
+    const auto dt = tensor.data_type();
+    if (dt == DataType::F16)
+    {
+        arm_compute::utils::uniform_real_distribution_16bit<half> fp16_dist{-1.0f, 1.0f};
+        library->fill(tensor, fp16_dist, i);
+    }
+    else if (dt == DataType::F32)
+    {
+        std::uniform_real_distribution<float> fp32_dist(-1.0f, 1.0f);
+        library->fill(tensor, fp32_dist, i);
+    }
+    else
+    {
+        // Every other data type falls back to the library's uniform fill
+        library->fill_tensor_uniform(tensor, i);
+    }
+}
+
+} // namespace
+
+/** General Conv2d fixture
+ *  Adapted from tests/validation/fixtures/ConvolutionLayerFixture.h
+ *  TODO: Parameterize to be fully backend agnostic: COMPMID-5760; remove Gpu from name
+ */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuConv2dValidationGenericFixture : public framework::Fixture
+{
+public:
+    using TBias = typename std::conditional<std::is_same<typename std::decay<T>::type, uint8_t>::value ||
+                                                std::is_same<typename std::decay<T>::type, int8_t>::value,
+                                            int32_t,
+                                            T>::type; // If T: uint8_t or int8_t then TBias: int32_t, otherwise TBias: T
+    // Records the configuration, then computes _target (GPU runtime) and _reference for it.
+    void setup(TensorShape          input_shape,
+               TensorShape          weights_shape,
+               TensorShape          bias_shape,
+               TensorShape          output_shape,
+               const PadStrideInfo &info,
+               const Size2D        &dilation,
+               DataType             data_type,
+               DataLayout           data_layout,
+               QuantizationInfo     quantization_info,
+               QuantizationInfo     weight_quantization_info)
+    {
+        ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NHWC); // Dynamic fusion conv2d only supports NHWC layout
+        const Conv2dAttributes conv2d_attr = convert_pad_stride_info_to_conv_attr(info, dilation);
+        _data_type                         = data_type;
+        _data_layout                       = data_layout;
+        _is_quantized                      = is_data_type_quantized_asymmetric(data_type);
+        _quantization_info                 = quantization_info;
+        _weight_quantization_info          = weight_quantization_info;
+        _bias_data_type                    = _is_quantized ? DataType::S32 : data_type;
+        _target                            = compute_target(input_shape, weights_shape, bias_shape, conv2d_attr);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, conv2d_attr);
+    }
+
+protected:
+    // Input/weights shapes are taken by value and permuted below before the NHWC tensor infos are created
+    TensorType compute_target(TensorShape        input_shape,
+                              TensorShape        weights_shape,
+                              const TensorShape &bias_shape,
+                              Conv2dAttributes   conv2d_attr)
+    {
+        ARM_COMPUTE_ERROR_ON(_data_layout != DataLayout::NHWC);
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+        permute(weights_shape, PermutationVector(2U, 0U, 1U));
+        CLScheduler::get().default_reinit();
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors (the bias uses _bias_data_type, which equals _data_type unless quantized)
+        ITensorInfo *input_info  = context.create_tensor_info(TensorInfo(input_shape, 1, _data_type, _data_layout));
+        ITensorInfo *weight_info = context.create_tensor_info(TensorInfo(weights_shape, 1, _data_type, _data_layout));
+        ITensorInfo *bias_info   = context.create_tensor_info(TensorInfo(bias_shape, 1, _bias_data_type, _data_layout));
+        ITensorInfo *dst_info    = context.create_tensor_info();
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, input_info, weight_info, bias_info, conv2d_attr);
+        GpuOutput::create_op(sketch, ans_info, dst_info);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+        // Construct user tensors
+        TensorType t_input{};
+        TensorType t_weight{};
+        TensorType t_bias{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_input.allocator()->init(*input_info);
+        t_weight.allocator()->init(*weight_info);
+        t_bias.allocator()->init(*bias_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_input.allocator()->allocate();
+        t_weight.allocator()->allocate();
+        t_bias.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_input), 0);
+        fill(AccessorType(t_weight), 1);
+        fill(AccessorType(t_bias), 2);
+
+        // Run runtime
+        runtime.run({&t_input, &t_weight, &t_bias, &t_dst});
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape,
+                                      const TensorShape &weights_shape,
+                                      const TensorShape &bias_shape,
+                                      const TensorShape &output_shape,
+                                      Conv2dAttributes   conv2d_attr)
+    {
+        // Create reference tensors; the bias data type follows _bias_data_type so it agrees with TBias
+        SimpleTensor<T>     src{input_shape, _data_type, 1, _quantization_info};
+        SimpleTensor<T>     weight{weights_shape, _data_type, 1, _weight_quantization_info};
+        SimpleTensor<TBias> bias{bias_shape, _bias_data_type, 1, _quantization_info};
+
+        fill(src, 0);
+        fill(weight, 1);
+        fill(bias, 2);
+
+        auto src_nchw          = src;
+        auto weights_nchw      = weight;
+        auto bias_nchw         = bias;
+        auto output_shape_nchw = output_shape;
+
+        PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left,
+                                        conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+                                        DimensionRoundingType{});
+        auto          dst_nchw = reference::convolution_layer(src_nchw, weights_nchw, bias_nchw, output_shape_nchw,
+                                                              legacy_pad_stride, conv2d_attr.dilation());
+        return dst_nchw;
+    }
+
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+    DataType         _data_type{};
+    DataType         _bias_data_type{};
+    DataLayout       _data_layout{};
+    QuantizationInfo _quantization_info{};
+    QuantizationInfo _weight_quantization_info{};
+    bool             _is_quantized = false;
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuConv2dValidationFixture
+    : public DynamicFusionGpuConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape          input_shape,
+               TensorShape          weights_shape,
+               TensorShape          bias_shape,
+               TensorShape          output_shape,
+               const PadStrideInfo &info,
+               const Size2D        &dilation,
+               DataType             data_type,
+               DataLayout           data_layout,
+               QuantizationInfo     quantization_info)
+    { // Parameter names follow the generic setup (bias before output); arguments are forwarded positionally
+        DynamicFusionGpuConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, data_layout,
+            quantization_info, quantization_info); // the same info serves as the weight quantization info
+    }
+};
+
+/** Specific Conv2d method: Direct Conv2d fixture
+ *  Adapted from tests/validation/fixtures/DirectConvolutionLayerFixture.h
+ *  TODO: Parameterize to be fully backend agnostic: COMPMID-5760
+ */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionDirectConv2dValidationGenericFixture : public framework::Fixture
+{
+public:
+    using TBias =
+        typename std::conditional<std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int32_t, T>::type;
+    // Derives the weights, bias and output shapes from the arguments, then computes _target and _reference.
+    void setup(TensorShape      input_shape,
+               int              stride_x,
+               int              stride_y,
+               int              pad_x,
+               int              pad_y,
+               unsigned int     kernel_size,
+               unsigned int     num_kernels,
+               DataType         data_type,
+               QuantizationInfo quantization_info,
+               DataLayout       data_layout)
+    {
+        ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NHWC); // Only NHWC is accepted by dynamic fusion conv2d
+
+        TensorShape         weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels);
+        const TensorShape   bias_shape(num_kernels);
+        const PadStrideInfo pad_stride(stride_x, stride_y, pad_x, pad_y, DimensionRoundingType::FLOOR);
+        const DataType      bias_dt = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+
+        const Conv2dAttributes attributes = convert_pad_stride_info_to_conv_attr(pad_stride, {1U, 1U} /* dilation */);
+
+        TensorInfo src_info(input_shape, 1, data_type);
+        TensorInfo wei_info(weights_shape, 1, data_type);
+
+        const TensorShape output_shape =
+            misc::shape_calculator::compute_deep_convolution_shape(src_info, wei_info, pad_stride);
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, attributes, data_type,
+                                    bias_dt, quantization_info, data_layout);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, pad_stride, data_type,
+                                       bias_dt, quantization_info);
+    }
+
+protected:
+    TensorType compute_target(TensorShape             input_shape,
+                              TensorShape             weights_shape,
+                              const TensorShape      &bias_shape,
+                              TensorShape             output_shape,
+                              const Conv2dAttributes &conv2d_attr,
+                              DataType                data_type,
+                              DataType                bias_data_type,
+                              QuantizationInfo        quantization_info,
+                              const DataLayout       &data_layout)
+    {
+        ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NHWC);
+        ARM_COMPUTE_UNUSED(quantization_info);
+        // The dataset describes shapes in NCHW order; permute the local copies before building tensor infos
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+        permute(weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(output_shape, PermutationVector(2U, 0U, 1U));
+
+        auto               compile_ctx = CLKernelLibrary::get().get_compile_context();
+        GpuWorkloadContext context{&compile_ctx};
+        GpuWorkloadSketch  sketch{&context};
+
+        // Describe the operator's tensors to the sketch
+        auto src_info  = context.create_tensor_info(TensorInfo(input_shape, 1, data_type, data_layout));
+        auto wei_info  = context.create_tensor_info(TensorInfo(weights_shape, 1, data_type, data_layout));
+        auto bias_info = context.create_tensor_info(TensorInfo(bias_shape, 1, bias_data_type, data_layout));
+        auto dst_info  = context.create_tensor_info();
+
+        ITensorInfo *conv_out_info = FunctionType::create_op(sketch, src_info, wei_info, bias_info, conv2d_attr);
+        GpuOutput::create_op(sketch, conv_out_info, dst_info);
+
+        // Build the runtime from the finished sketch
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        for (auto &aux : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *aux_tensor = std::get<0>(aux);
+            TensorInfo    aux_info   = std::get<1>(aux);
+            AuxMemoryInfo aux_mem    = std::get<2>(aux);
+            aux_tensor->allocator()->init(aux_info, aux_mem.alignment);
+            aux_tensor->allocator()->allocate(); // Backed by ACL allocated memory
+        }
+        // Tensors handed to the runtime by the test
+        TensorType t_input{};
+        TensorType t_weight{};
+        TensorType t_bias{};
+        TensorType t_dst{};
+
+        // Bind each user tensor to its sketch tensor info
+        t_input.allocator()->init(*src_info);
+        t_weight.allocator()->init(*wei_info);
+        t_bias.allocator()->init(*bias_info);
+        t_dst.allocator()->init(*dst_info);
+
+        ARM_COMPUTE_ASSERT(t_input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_weight.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_dst.info()->is_resizable());
+
+        // Allocate backing memory; the tensors are expected to stop being resizable afterwards
+        t_input.allocator()->allocate();
+        t_weight.allocator()->allocate();
+        t_bias.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!t_input.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_weight.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_dst.info()->is_resizable());
+
+        fill(AccessorType(t_input), 0);
+        fill(AccessorType(t_weight), 1);
+        fill(AccessorType(t_bias), 2);
+
+        // Execute the workload
+        runtime.run({&t_input, &t_weight, &t_bias, &t_dst});
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape   &input_shape,
+                                      const TensorShape   &weights_shape,
+                                      const TensorShape   &bias_shape,
+                                      const TensorShape   &output_shape,
+                                      const PadStrideInfo &info,
+                                      DataType             data_type,
+                                      DataType             bias_data_type,
+                                      QuantizationInfo     quantization_info)
+    {
+        // Reference-side tensors
+        SimpleTensor<T>     ref_src{input_shape, data_type, 1, quantization_info};
+        SimpleTensor<T>     ref_weights{weights_shape, data_type, 1, quantization_info};
+        SimpleTensor<TBias> ref_bias{bias_shape, bias_data_type, 1, quantization_info};
+
+        // Same fill indices (0, 1, 2) as the target tensors
+        fill(ref_src, 0);
+        fill(ref_weights, 1);
+        fill(ref_bias, 2);
+
+        // Evaluate the reference convolution and hand the result back
+        return reference::convolution_layer<T>(ref_src, ref_weights, ref_bias, output_shape, info);
+    }
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionDirectConv2dValidationFixture
+    : public DynamicFusionDirectConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape  input_shape,
+               int          stride_x,
+               int          stride_y,
+               int          pad_x,
+               int          pad_y,
+               unsigned int kernel_size,
+               unsigned int num_kernels,
+               DataType     data_type,
+               DataLayout   data_layout)
+    {
+        using Base = DynamicFusionDirectConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>;
+        Base::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type,
+                    QuantizationInfo(), data_layout); // default-constructed quantization info
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
new file mode 100644
index 0000000000..69bd0efbdc
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryValidationGenericFixture : public framework::Fixture
+{
+public:
+    void setup(ArithmeticOperation ref_op,
+               const TensorShape  &shape0,
+               const TensorShape  &shape1,
+               const TensorShape  &shape2,
+               DataType            data_type,
+               bool                is_inplace,
+               bool                fuse_two_ops = false)
+    {
+        _ref_op     = ref_op;
+        _is_inplace = is_inplace;
+        _data_type  = data_type;
+        _fuse       = fuse_two_ops;
+        ARM_COMPUTE_ERROR_ON_MSG(_fuse && shape2.total_size() == 0, "No shape2 provided for fusion of two ops.");
+        ARM_COMPUTE_ERROR_ON_MSG(_fuse && _is_inplace, "In place for fusing case not supported yet.");
+        _target    = compute_target(shape0, shape1, shape2);    // result of the GPU workload
+        _reference = compute_reference(shape0, shape1, shape2); // result of the reference implementation
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        if (is_data_type_float(tensor.data_type()))
+        {
+            switch (_ref_op)
+            {
+                case ArithmeticOperation::DIV: // NOTE(review): range presumably excluded near zero - confirm
+                    library->fill_tensor_uniform_ranged(tensor, i, {std::pair<float, float>(-0.001f, 0.001f)});
+                    break;
+                case ArithmeticOperation::POWER:
+                    library->fill_tensor_uniform(tensor, i, 0.0f, 5.0f);
+                    break;
+                default:
+                    library->fill_tensor_uniform(tensor, i);
+            }
+        }
+        else if (tensor.data_type() == DataType::S32)
+        {
+            switch (_ref_op)
+            {
+                case ArithmeticOperation::DIV: // signed literals: no reliance on unsigned-to-signed wrap-around
+                    library->fill_tensor_uniform_ranged(tensor, i, {std::pair<int32_t, int32_t>(-1, 1)});
+                    break;
+                default:
+                    library->fill_tensor_uniform(tensor, i);
+            }
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
+    {
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Fuse first element wise binary Op
+        ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(shape0, 1, _data_type));
+        ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(shape1, 1, _data_type));
+        ITensorInfo *dst_info = context.create_tensor_info();
+
+        ITensorInfo *rhs_info_fuse = nullptr;
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, lhs_info, rhs_info);
+
+        if (_fuse)
+        {
+            rhs_info_fuse          = context.create_tensor_info(TensorInfo(shape2, 1, _data_type));
+            ITensorInfo *ans2_info = FunctionType::create_op(sketch, ans_info, rhs_info_fuse);
+            GpuOutput::create_op(sketch, ans2_info, dst_info);
+        }
+        else
+        {
+            GpuOutput::create_op(sketch, ans_info, dst_info);
+        }
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_lhs{};
+        TensorType t_rhs{};
+        TensorType t_rhs_fuse{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_lhs.allocator()->init(*lhs_info);
+        t_rhs.allocator()->init(*rhs_info);
+        t_dst.allocator()->init(*dst_info);
+        if (_fuse)
+        {
+            t_rhs_fuse.allocator()->init(*rhs_info_fuse);
+        }
+
+        // Allocate and fill user tensors
+        // Instead of using ACL allocator, the user can choose to import memory into the tensors
+        t_lhs.allocator()->allocate();
+        t_rhs.allocator()->allocate();
+        t_dst.allocator()->allocate();
+        if (_fuse)
+        {
+            t_rhs_fuse.allocator()->allocate();
+        }
+
+        fill(AccessorType(t_lhs), 0);
+        fill(AccessorType(t_rhs), 1);
+        if (_fuse)
+        {
+            fill(AccessorType(t_rhs_fuse), 2);
+        }
+
+        // Run runtime
+        if (_fuse)
+        {
+            runtime.run({&t_lhs, &t_rhs, &t_rhs_fuse, &t_dst});
+        }
+        else
+        {
+            runtime.run({&t_lhs, &t_rhs, &t_dst});
+        }
+
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
+    {
+        const TensorShape out_shape      = TensorShape::broadcast_shape(shape0, shape1);
+        const TensorShape out_shape_fuse = _fuse ? TensorShape::broadcast_shape(out_shape, shape2) : out_shape;
+
+        // Create reference (the fused output shape comes from the second op's operands: out_shape and shape2)
+        SimpleTensor<T> ref_lhs{shape0, _data_type, 1, QuantizationInfo()};
+        SimpleTensor<T> ref_rhs{shape1, _data_type, 1, QuantizationInfo()};
+        SimpleTensor<T> ref_rhs_fuse{shape2, _data_type, 1, QuantizationInfo()};
+        SimpleTensor<T> ref_dst{out_shape, _data_type, 1, QuantizationInfo()};
+        SimpleTensor<T> ref_dst_fuse{out_shape_fuse, _data_type, 1, QuantizationInfo()};
+
+        // Fill reference
+        fill(ref_lhs, 0);
+        fill(ref_rhs, 1);
+
+        reference::arithmetic_operation<T>(_ref_op, ref_lhs, ref_rhs, ref_dst, ConvertPolicy::WRAP);
+        if (_fuse)
+        {
+            fill(ref_rhs_fuse, 2);
+            reference::arithmetic_operation<T>(_ref_op, ref_dst, ref_rhs_fuse, ref_dst_fuse, ConvertPolicy::WRAP);
+        }
+        SimpleTensor<T> *ret = _fuse ? &ref_dst_fuse : &ref_dst;
+        return *ret;
+    }
+
+    ArithmeticOperation _ref_op{ArithmeticOperation::ADD};
+    TensorType          _target{};
+    SimpleTensor<T>     _reference{};
+    DataType            _data_type{};
+    DataLayout          _data_layout{};
+    bool                _is_inplace{false};
+    bool                _fuse{false};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryOneOpValidationFixture
+    : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(ArithmeticOperation ref_op, const TensorShape &shape0, DataType data_type, bool is_inplace)
+    { // Both operands use shape0; fuse_two_ops is not passed, so the generic setup's default applies
+        DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            ref_op, shape0, /* shape1 */ shape0, /* shape2 */ TensorShape(), data_type, is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture
+    : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(ArithmeticOperation ref_op,
+               const TensorShape  &shape0,
+               const TensorShape  &shape1,
+               DataType            data_type,
+               bool                is_inplace)
+    { // Operands have independent shapes; fuse_two_ops is not passed, so the generic setup's default applies
+        DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            ref_op, shape0, shape1, /* shape2 */ TensorShape(), data_type, is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture
+    : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(ArithmeticOperation ref_op,
+               const TensorShape  &shape0,
+               const TensorShape  &shape1,
+               const TensorShape  &shape2,
+               DataType            data_type,
+               bool                is_inplace,
+               bool                fuse_two_ops)
+    { // Forwards every argument unchanged, including the explicit fuse_two_ops flag
+        DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            ref_op, shape0, shape1, shape2, data_type, is_inplace, fuse_two_ops);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
new file mode 100644
index 0000000000..4c1cc94d3d
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ReshapeLayer.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename U>
+void fill(U &&tensor, int i) // Fills tensor through the assets library, selecting the distribution by data type.
+{ // i is forwarded as the last argument of every library fill call (presumably a seed offset - TODO confirm).
+    switch (tensor.data_type())
+    {
+        case DataType::F16:
+        { // 16-bit float distribution constructed with the bounds -1.0f and 1.0f
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{-1.0f, 1.0f};
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        case DataType::F32:
+        { // float distribution constructed with the bounds -1.0f and 1.0f
+            std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        default: // any other data type falls back to fill_tensor_uniform
+            library->fill_tensor_uniform(tensor, i);
+    }
+}
+
+} // namespace
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuMatMulValidationGenericFixture : public framework::Fixture
+{ // Runs a dynamic-fusion MatMul workload and a GEMM-based reference; results land in _target and _reference.
+public:
+    void setup(TensorShape lhs_shape,
+               TensorShape rhs_shape,
+               TensorShape output_shape,
+               bool        transpose_a,
+               bool        transpose_b,
+               int         M0,
+               int         N0,
+               int         K0,
+               bool        export_rhs_to_cl_image,
+               DataType    data_type)
+    {
+        // The incoming shapes describe non-transposed matrices; swap dims 0 and 1 when a transpose is requested.
+        if (transpose_a)
+        {
+            permute(lhs_shape, PermutationVector(1U, 0U));
+        }
+        if (transpose_b)
+        {
+            permute(rhs_shape, PermutationVector(1U, 0U));
+        }
+
+        // Skip configurations unsupported by the device.
+        _device_supports_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device());
+        if (!_device_supports_export_to_cl_image && export_rhs_to_cl_image)
+        {
+            ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+            framework::ARM_COMPUTE_PRINT_INFO();
+            return; // Note: The corresponding FIXTURE_DATA_TEST_CASEs must skip their validation as well.
+        }
+
+        _target    = compute_target(lhs_shape, rhs_shape, transpose_a, transpose_b, M0, N0, K0, export_rhs_to_cl_image,
+                                    data_type);
+        _reference = compute_reference(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, data_type);
+    }
+
+protected:
+    TensorType compute_target(TensorShape &shape_a,
+                              TensorShape &shape_b,
+                              bool         transpose_a,
+                              bool         transpose_b,
+                              int          M0,
+                              int          N0,
+                              int          K0,
+                              bool         export_rhs_to_cl_image,
+                              DataType     data_type)
+    { // Builds a sketch holding one MatMul op plus an output op, runs it on filled tensors and returns the result.
+        ARM_COMPUTE_UNUSED(export_rhs_to_cl_image); // the flag is not forwarded to the MatMul settings below
+        CLScheduler::get().default_reinit();
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors
+        ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(shape_a, 1, data_type));
+        ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(shape_b, 1, data_type));
+        ITensorInfo *dst_info = context.create_tensor_info();
+
+        MatMulAttributes matmul_attr{};
+        matmul_attr.adj_lhs(transpose_a);
+        matmul_attr.adj_rhs(transpose_b);
+
+        GpuMatMulSettings matmul_settings{};
+        matmul_settings.m0(M0);
+        matmul_settings.n0(N0);
+        matmul_settings.k0(K0);
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+        GpuOutput::create_op(sketch, ans_info, dst_info);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        for (auto &data : runtime.get_auxiliary_tensors()) // allocate every auxiliary tensor the runtime reports
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_lhs{};
+        TensorType t_rhs{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_lhs.allocator()->init(*lhs_info);
+        t_rhs.allocator()->init(*rhs_info);
+        t_dst.allocator()->init(*dst_info);
+
+        ARM_COMPUTE_ASSERT(t_lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_dst.info()->is_resizable());
+
+        // Allocate and fill user tensors
+        t_lhs.allocator()->allocate();
+        t_rhs.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!t_lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_dst.info()->is_resizable());
+
+        fill(AccessorType(t_lhs), 0); // same fill indices (0 and 1) as compute_reference
+        fill(AccessorType(t_rhs), 1);
+
+        // Run runtime
+        runtime.run({&t_lhs, &t_rhs, &t_dst});
+
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape_a,
+                                      const TensorShape &shape_b,
+                                      const TensorShape &output_shape,
+                                      bool               pretranspose_a,
+                                      bool               pretranspose_b,
+                                      DataType           data_type)
+    { // Computes the expected result with reference::gemm, transposing the inputs first when requested.
+        // We collapse dimensions > 3 onto dimension 3, i.e. 5D+ tensors will look like 3D
+        // This is necessary unless we choose to extend gemm reference for 5D+ tensors
+        TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ);
+        TensorShape shape_a_collapsed      = shape_a.collapsed_from(Window::DimZ);
+        TensorShape shape_b_collapsed      = shape_b.collapsed_from(Window::DimZ);
+
+        // Create reference
+        SimpleTensor<T> a{shape_a_collapsed, data_type, 1};
+        SimpleTensor<T> b{shape_b_collapsed, data_type, 1};
+        SimpleTensor<T> c{output_shape_collapsed, data_type, 1};
+
+        // Fill reference
+        fill(a, 0);
+        fill(b, 1);
+
+        /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
+           therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K)
+           in order to be able to call reference implementation that works with (B x M x K) input.
+           Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), B must be pre-transposed before passing it to the fixture. */
+
+        // Define transposed shapes
+        TensorShape a_transposed_shape(a.shape());
+        a_transposed_shape.set(0, a.shape().y());
+        a_transposed_shape.set(1, a.shape().x());
+
+        TensorShape b_transposed_shape(b.shape());
+        b_transposed_shape.set(0, b.shape().y());
+        b_transposed_shape.set(1, b.shape().x());
+
+        // Define transposed tensors
+        SimpleTensor<T> a_transposed{a_transposed_shape, data_type};
+        SimpleTensor<T> b_transposed{b_transposed_shape, data_type};
+
+        // pretranspose a if necessary
+        if (pretranspose_a)
+        {
+            a_transposed = reference::permute<T>(a, PermutationVector(1U, 0U));
+        }
+
+        // pretranspose b if necessary
+        if (pretranspose_b)
+        {
+            b_transposed = reference::permute<T>(b, PermutationVector(1U, 0U));
+        }
+
+        // Use transposed tensors if boolean enabled else use original tensors
+        SimpleTensor<T> result =
+            reference::gemm<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, 1.0f, 0.f);
+
+        // We reshape the gemm output back if the tensor is high dimensional
+        if (output_shape_collapsed != output_shape)
+        {
+            // Restore the original, uncollapsed output shape
+            result = reference::reshape_layer(result, output_shape);
+        }
+
+        return result;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    bool            _device_supports_export_to_cl_image{false};
+    bool            _device_supports_mmul{false}; // not referenced anywhere inside this class
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuMatMulValidationFixture
+    : public DynamicFusionGpuMatMulValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{ // Same parameters as the generic MatMul fixture, but the export-to-cl-image flag is never forwarded.
+public:
+    void setup(TensorShape lhs_shape,
+               TensorShape rhs_shape,
+               TensorShape output_shape,
+               bool        transpose_a,
+               bool        transpose_b,
+               int         M0,
+               int         N0,
+               int         K0,
+               bool        export_rhs_to_cl_image,
+               DataType    data_type)
+    {
+        ARM_COMPUTE_UNUSED(export_rhs_to_cl_image); // ignored: the generic setup below always receives false
+        DynamicFusionGpuMatMulValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, M0, N0, K0,
+            false /* export_rhs_to_cl_image */, data_type);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
new file mode 100644
index 0000000000..b0c7143d91
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "src/dynamic_fusion/utils/Utils.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/PoolingLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dValidationGenericFixture : public framework::Fixture
+{ // Runs a dynamic-fusion Pool2d workload and the reference pooling layer; results land in _target and _reference.
+public:
+    void setup(TensorShape input_shape, const Pool2dAttributes &pool_attr, DataType data_type)
+    { // The reference takes a PoolingLayerInfo, so the attributes are converted before being passed on.
+        _target    = compute_target(input_shape, pool_attr, data_type);
+        _reference = compute_reference(
+            input_shape, convert_pool_attr_to_pool_info(pool_attr, true /* mixed_precision */), data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    { // Select the distribution by data type; i is forwarded as the last argument of every library fill call.
+        switch (tensor.data_type())
+        {
+            case DataType::F16:
+            { // 16-bit float distribution constructed with the bounds -1.0f and 1.0f
+                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{-1.0f, 1.0f};
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
+            { // float distribution constructed with the bounds -1.0f and 1.0f
+                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default: // any other data type falls back to fill_tensor_uniform
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    // Builds and runs the Pool2d workload. input_shape is given in NCHW order and is permuted for NHWC below.
+    TensorType compute_target(TensorShape input_shape, const Pool2dAttributes &pool_attr, const DataType data_type)
+    {
+        CLScheduler::get().default_reinit();
+
+        // Change shape due to NHWC data layout, test shapes are NCHW
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors
+        auto input_info = context.create_tensor_info(TensorInfo(input_shape, 1, data_type, DataLayout::NHWC));
+        auto dst_info   = context.create_tensor_info();
+
+        // Create Pool2dSettings
+        GpuPool2dSettings pool_settings = GpuPool2dSettings();
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, input_info, pool_attr, pool_settings);
+        GpuOutput::create_op(sketch, ans_info, dst_info);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+        // Construct user tensors
+        TensorType t_input{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_input.allocator()->init(*input_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_input.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_input), 0); // same fill index (0) as compute_reference
+
+        // Run runtime
+        runtime.run({&t_input, &t_dst});
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type)
+    { // Unlike compute_target, the shape is used as given (no permutation) and NCHW is passed to the reference.
+        // Create reference
+        SimpleTensor<T> src(shape, data_type, 1, QuantizationInfo());
+        // Fill reference
+        fill(src, 0);
+        return reference::pooling_layer<T>(src, pool_info, QuantizationInfo(), nullptr, DataLayout::NCHW);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dValidationFixture
+    : public DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{ // Pool2d fixture configured from individual pooling parameters instead of a Pool2dAttributes object.
+public:
+    void setup(TensorShape input_shape,
+               PoolingType pool_type,
+               Size2D      pool_size,
+               Padding2D   pad,
+               Size2D      stride,
+               bool        exclude_padding,
+               DataType    data_type)
+    { // Assemble Pool2dAttributes from the individual parameters and forward to the generic setup().
+        DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            input_shape,
+            Pool2dAttributes().pool_type(pool_type).pool_size(pool_size).pad(pad).stride(stride).exclude_padding(
+                exclude_padding),
+            data_type);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dSpecialValidationFixture
+    : public DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{ // Pool2d fixture that receives a ready-made Pool2dAttributes object.
+public:
+    void setup(TensorShape input_shape, Pool2dAttributes pool_attr, DataType data_type)
+    { // Forward all arguments to the generic setup() unchanged.
+        DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            input_shape, pool_attr, data_type);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h
new file mode 100644
index 0000000000..c9ffbccbc7
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_ACTIVATIONFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_ACTIVATIONFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename... TArgs>
+class DynamicFusionActivationValidationFixture : public framework::Fixture
+{ // Runs a dynamic-fusion activation workload (optionally two chained ops) and the matching reference.
+public:
+    void setup(TensorShape shape, bool fuse, DataType data_type, ActivationLayerInfo act_info, TArgs... args)
+    { // Record the configuration in members first: fill() and compute_target() read them.
+        _fuse      = fuse;
+        _data_type = data_type;
+        _function  = act_info.activation();
+        _target    = compute_target(shape, args...);
+        _reference = compute_reference(shape, act_info);
+    }
+
+protected:
+    std::vector<T> get_boundary_values(T min, T max)
+    {
+        // This function will return a vector filled with the following values that can
+        // represent two partitions derived from equivalent partitioning.
+        // * Lower partition: min, min + delta, lower quarter (nominal), center - delta
+        // * Upper partition: center, center + delta, upper quarter (nominal), max - delta, max
+        const auto delta         = is_data_type_float(_data_type) ? T(0.1f) : T(1);
+        const auto center_value  = (min + max) / 2;
+        const auto lower_quarter = (min + center_value) / 2;
+        const auto upper_quarter = (center_value + max) / 2;
+
+        std::vector<T> boundary_values{};
+
+        // Keep only the candidates that are still within [min, max] after subtracting/adding delta
+        auto insert_values = [&boundary_values, &min, &max](const std::initializer_list<T> &new_values)
+        {
+            for (auto &v : new_values)
+            {
+                if (v >= min && v <= max)
+                {
+                    boundary_values.emplace_back(v);
+                }
+            }
+        };
+
+        insert_values({min, static_cast<T>(min + delta), static_cast<T>(lower_quarter),
+                       static_cast<T>(center_value - delta)}); // lower partition
+        insert_values({static_cast<T>(center_value), static_cast<T>(center_value + delta),
+                       static_cast<T>(upper_quarter), static_cast<T>(max - delta), max}); // upper partition
+
+        return boundary_values;
+    }
+
+    template <typename U>
+    void fill(U &&tensor)
+    { // Fill with the boundary values derived from the test bounds reported for _function and _data_type.
+        float min_bound                = 0;
+        float max_bound                = 0;
+        std::tie(min_bound, max_bound) = get_activation_layer_test_bounds<T>(_function, _data_type);
+        library->fill_static_values(tensor, get_boundary_values(static_cast<T>(min_bound), static_cast<T>(max_bound)));
+    }
+
+    TensorType compute_target(const TensorShape &shape, TArgs... args)
+    { // Build a sketch with one activation op (two chained ops when _fuse is set), then run it on filled tensors.
+        // Create a new workload sketch
+        CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        GpuWorkloadContext context{&cl_compile_ctx};
+        GpuWorkloadSketch  sketch{&context};
+
+        // Create sketch tensors
+        ITensorInfo *src_info = context.create_tensor_info(TensorInfo(shape, 1, _data_type));
+        ITensorInfo *dst_info = context.create_tensor_info(TensorInfo(shape, 1, _data_type));
+
+        ITensorInfo *ans_0_info = FunctionType::create_op(sketch, src_info, args...);
+        if (_fuse)
+        { // feed the first op's result into a second op created with the same args
+            ITensorInfo *ans_1_info = FunctionType::create_op(sketch, ans_0_info, args...);
+            GpuOutput::create_op(sketch, ans_1_info, dst_info);
+        }
+        else
+        {
+            GpuOutput::create_op(sketch, ans_0_info, dst_info);
+        }
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // Construct user tensors
+        TensorType t_src{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_src.allocator()->init(*src_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_src.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_src));
+
+        // Run runtime
+        runtime.run({&t_src, &t_dst});
+
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape, ActivationLayerInfo act_info)
+    { // Apply the reference activation once, or twice when _fuse is set, mirroring compute_target.
+        // Create reference
+        SimpleTensor<T> src{shape, _data_type, 1};
+
+        // Fill reference
+        fill(src);
+
+        auto tmp = reference::activation_layer<T>(src, act_info);
+
+        if (_fuse)
+        {
+            auto dst = reference::activation_layer<T>(tmp, act_info);
+            return dst;
+        }
+        else
+        {
+            return tmp;
+        }
+    }
+
+protected:
+    ActivationLayerInfo::ActivationFunction _function{}; // set in setup() from act_info.activation()
+    bool                                    _fuse{false}; // when true, two activation ops are chained
+    DataType                                _data_type{};
+    TensorType                              _target{};
+    SimpleTensor<T>                         _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionSigmoidValidationFixture
+    : public DynamicFusionActivationValidationFixture<TensorType, AccessorType, FunctionType, T>
+{ // Activation fixture specialised for the LOGISTIC function; no extra operator arguments are passed.
+public:
+    void setup(TensorShape shape, bool fuse, DataType data_type)
+    { // Build the LOGISTIC activation info used by the reference and forward to the generic setup().
+        ActivationLayerInfo act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
+        DynamicFusionActivationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, fuse,
+                                                                                                   data_type, act_info);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionTanhValidationFixture
+    : public DynamicFusionActivationValidationFixture<TensorType, AccessorType, FunctionType, T>
+{ // Activation fixture specialised for the TANH function; no extra operator arguments are passed.
+public:
+    void setup(TensorShape shape, bool fuse, DataType data_type)
+    { // Build the TANH activation info (constructed with 1.0f and 1.0f) and forward to the generic setup().
+        ActivationLayerInfo act_info{ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
+        DynamicFusionActivationValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, fuse,
+                                                                                                   data_type, act_info);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_ACTIVATIONFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h
new file mode 100644
index 0000000000..08fffb305b
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_CASTFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_CASTFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
+class DynamicFusionCastValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape, DataType dt_in, DataType dt_out, ConvertPolicy policy)
+    {
+        _target    = compute_target(shape, dt_in, dt_out, policy); // output of the dynamic fusion runtime
+        _reference = compute_reference(shape, dt_in, dt_out, policy); // output of reference::depth_convert
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, DataType dt_in, DataType dt_out)
+    {
+        // When casting to F16, restrict the range of 16/32-bit integer inputs to avoid inf values in the output
+        if (dt_out == DataType::F16)
+        {
+            constexpr int signed_min   = -32000; // lower bound used for S16 / S32 inputs
+            constexpr int signed_max   = 32000; // upper bound used for S16 / S32 inputs
+            constexpr int unsigned_min = 0; // lower bound used for U16 / U32 inputs
+            constexpr int unsigned_max = 65000; // upper bound used for U16 / U32 inputs
+
+            switch (dt_in)
+            {
+                case DataType::U8:
+                case DataType::QASYMM8:
+                case DataType::QASYMM8_SIGNED:
+                case DataType::S8:
+                case DataType::F32:
+                {
+                    library->fill_tensor_uniform(tensor, i); // no explicit bounds for these input types
+                    break;
+                }
+                case DataType::U16:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<uint16_t>(unsigned_min),
+                                                 static_cast<uint16_t>(unsigned_max));
+                    break;
+                }
+                case DataType::S16:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<int16_t>(signed_min),
+                                                 static_cast<int16_t>(signed_max));
+                    break;
+                }
+                case DataType::U32:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<uint32_t>(unsigned_min),
+                                                 static_cast<uint32_t>(unsigned_max));
+                    break;
+                }
+                case DataType::S32:
+                {
+                    library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(signed_min),
+                                                 static_cast<int32_t>(signed_max));
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+            }
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, i); // any other output type: unbounded uniform fill
+        }
+    }
+
+    // Builds, configures and runs the cast workload; the input tensor info is declared with NCHW layout
+    TensorType
+    compute_target(const TensorShape &shape, const DataType dt_in, const DataType dt_out, const ConvertPolicy policy)
+    {
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors
+        // Here, we use DataLayout::NCHW just for the test. However, the optimal data layout to
+        // be used with dynamic fusion is NHWC
+        ITensorInfo *src_info =
+            context.create_tensor_info(TensorInfo(shape, 1, dt_in, DataLayout::NCHW)); // layout is not important
+        ITensorInfo *dst_info = context.create_tensor_info();
+
+        CastAttributes attributes;
+        attributes.convert_policy(policy).data_type(dt_out);
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, src_info, attributes);
+        GpuOutput::create_op(sketch, ans_info, dst_info); // connect the cast result to the destination tensor info
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_src{};
+        TensorType t_dst{};
+
+        // Initialize user tensors from the sketch tensor infos
+        t_src.allocator()->init(*src_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_src.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_src), 0, dt_in, dt_out); // same fill index (0) as compute_reference
+
+        // Run runtime
+        runtime.run({&t_src, &t_dst});
+        return t_dst;
+    }
+
+    SimpleTensor<T2>
+    compute_reference(const TensorShape &shape, const DataType dt_in, const DataType dt_out, const ConvertPolicy policy)
+    {
+        // Create reference input holding elements of the source type T1
+        SimpleTensor<T1> src{shape, dt_in, 1};
+
+        // Fill reference with the same fill index (0) that compute_target uses
+        fill(src, 0, dt_in, dt_out);
+
+        return reference::depth_convert<T1, T2>(src, dt_out, policy, 0);
+    }
+
+    TensorType       _target{}; // set by setup() from compute_target()
+    SimpleTensor<T2> _reference{}; // set by setup() from compute_reference()
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_CASTFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/ClampFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/ClampFixture.h
new file mode 100644
index 0000000000..e8f6f83e42
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/ClampFixture.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_CLAMPFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_CLAMPFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionClampValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape, ClampAttributes attributes, bool fuse, DataType data_type)
+    {
+        // CLAMP is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped.
+        ActivationLayerInfo act_info{ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() };
+
+        _fuse       = fuse;
+        _attributes = attributes;
+        _data_type  = data_type;
+        _target     = compute_target(shape, attributes);
+        _reference  = compute_reference(shape, act_info);
+    }
+
+protected:
+    std::vector<T> get_boundary_values(T min, T max)
+    {
+        // This function returns a vector filled with the following values, which
+        // represent two partitions derived from equivalence partitioning.
+        // * Lower partition: min, min + delta, lower quarter (nominal), center - delta
+        // * Upper partition: center, center + delta, upper quarter (nominal), max - delta, max
+        const auto delta         = is_data_type_float(_data_type) ? T(0.1f) : T(1);
+        const auto center_value  = (min + max) / 2;
+        const auto lower_quarter = (min + center_value) / 2;
+        const auto upper_quarter = (center_value + max) / 2;
+
+        std::vector<T> boundary_values{};
+
+        // Ensures every inserted value is still within [min, max] after subtracting/adding delta
+        auto insert_values = [&boundary_values, &min, &max](const std::initializer_list<T> &new_values)
+        {
+            for(auto &v : new_values)
+            {
+                if(v >= min && v <= max)
+                {
+                    boundary_values.emplace_back(v);
+                }
+            }
+        };
+
+        insert_values({ min, static_cast<T>(min + delta), static_cast<T>(lower_quarter), static_cast<T>(center_value - delta) });                               // lower partition
+        insert_values({ static_cast<T>(center_value), static_cast<T>(center_value + delta), static_cast<T>(upper_quarter), static_cast<T>(max - delta), max }); // upper partition
+
+        return boundary_values;
+    }
+
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        float min_bound = 0;
+        float max_bound = 0;
+        std::tie(min_bound, max_bound) = get_activation_layer_test_bounds<T>(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, _data_type);
+        library->fill_static_values(tensor, get_boundary_values(static_cast<T>(min_bound), static_cast<T>(max_bound)));
+    }
+
+    TensorType compute_target(const TensorShape &shape, ClampAttributes attributes)
+    {
+        // Create a new workload sketch
+        CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        GpuWorkloadContext context{ &cl_compile_ctx };
+        GpuWorkloadSketch  sketch{ &context };
+
+        // Create sketch tensors (source and destination share shape and data type)
+        ITensorInfo* src_info = context.create_tensor_info(TensorInfo(shape, 1, _data_type));
+        ITensorInfo* dst_info = context.create_tensor_info(TensorInfo(shape, 1, _data_type));
+
+        ITensorInfo *ans_0_info = FunctionType::create_op(sketch, src_info, attributes);
+        if(_fuse)
+        {
+            ITensorInfo *ans_1_info = FunctionType::create_op(sketch, ans_0_info, attributes); // second clamp, same attributes
+            GpuOutput::create_op(sketch, ans_1_info, dst_info);
+        }
+        else
+        {
+            GpuOutput::create_op(sketch, ans_0_info, dst_info);
+        }
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch); // NOTE(review): no auxiliary-tensor allocation here, unlike sibling fixtures - confirm
+
+        // Construct user tensors
+        TensorType t_src{};
+        TensorType t_dst{};
+
+        // Initialize user tensors from the sketch tensor infos
+        t_src.allocator()->init(*src_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_src.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_src));
+
+        // Run runtime
+        runtime.run({ &t_src, &t_dst });
+
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape, ActivationLayerInfo act_info)
+    {
+        // Create reference input
+        SimpleTensor<T> src{ shape, _data_type, 1, _quantization_info };
+
+        // Fill reference with the same boundary values used for the target
+        fill(src);
+
+        auto dst = reference::activation_layer<T>(src, act_info, _quantization_info);
+        return dst;
+    }
+
+protected:
+    QuantizationInfo _quantization_info{};
+    ClampAttributes  _attributes{};
+    bool             _fuse{ false };
+    DataType         _data_type{};
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_CLAMPFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h
new file mode 100644
index 0000000000..f02aa5e36a
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_MULFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_MULFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/Globals.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* We use a separate test fixture for Multiplication op instead of reusing ElementwiseBinaryFixture to avoid exposing
+ * the internal enum ElementwiseOp to the public utils/TypePrinters.h as required by the data test case macros
+ * to print the test data.
+ */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulValidationFixture : public framework::Fixture
+{
+public:
+    void setup(const TensorShape &shape0,
+               const TensorShape &shape1,
+               const TensorShape &shape2,
+               DataType           data_type,
+               bool               is_inplace,
+               bool               fuse_two_ops = false)
+    {
+        _data_type  = data_type;
+        _is_inplace = is_inplace; // within this class only read by the sanity check below
+        _fuse       = fuse_two_ops;
+        ARM_COMPUTE_ERROR_ON_MSG(_fuse && shape2.total_size() == 0, "No shape2 provided for fusion of two ops.");
+        ARM_COMPUTE_ERROR_ON_MSG(_fuse && _is_inplace, "In place for fusing case not supported yet.");
+        _target    = compute_target(shape0, shape1, shape2);
+        _reference = compute_reference(shape0, shape1, shape2);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        library->fill_tensor_uniform(tensor, i);
+    }
+
+    TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
+    {
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors and add the first multiplication op
+        ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(shape0, 1, _data_type));
+        ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(shape1, 1, _data_type));
+        ITensorInfo *dst_info = context.create_tensor_info();
+
+        ITensorInfo *rhs_info_fuse = nullptr; // only created when a second op is fused
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, lhs_info, rhs_info);
+
+        if (_fuse)
+        {
+            rhs_info_fuse          = context.create_tensor_info(TensorInfo(shape2, 1, _data_type));
+            ITensorInfo *ans2_info = FunctionType::create_op(sketch, ans_info, rhs_info_fuse); // (lhs * rhs) * rhs_fuse
+            GpuOutput::create_op(sketch, ans2_info, dst_info);
+        }
+        else
+        {
+            GpuOutput::create_op(sketch, ans_info, dst_info);
+        }
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_lhs{};
+        TensorType t_rhs{};
+        TensorType t_rhs_fuse{}; // left uninitialized and unallocated unless _fuse is set
+        TensorType t_dst{};
+
+        // Initialize user tensors from the sketch tensor infos
+        t_lhs.allocator()->init(*lhs_info);
+        t_rhs.allocator()->init(*rhs_info);
+        t_dst.allocator()->init(*dst_info);
+        if (_fuse)
+        {
+            t_rhs_fuse.allocator()->init(*rhs_info_fuse);
+        }
+
+        // Allocate and fill user tensors
+        // Instead of using ACL allocator, the user can choose to import memory into the tensors
+        t_lhs.allocator()->allocate();
+        t_rhs.allocator()->allocate();
+        t_dst.allocator()->allocate();
+        if (_fuse)
+        {
+            t_rhs_fuse.allocator()->allocate();
+        }
+
+        fill(AccessorType(t_lhs), 0); // fill indices 0/1/2 match those used in compute_reference
+        fill(AccessorType(t_rhs), 1);
+        if (_fuse)
+        {
+            fill(AccessorType(t_rhs_fuse), 2);
+        }
+
+        // Run runtime; the fused case passes the extra right-hand side operand
+        if (_fuse)
+        {
+            runtime.run({&t_lhs, &t_rhs, &t_rhs_fuse, &t_dst});
+        }
+        else
+        {
+            runtime.run({&t_lhs, &t_rhs, &t_dst});
+        }
+
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
+    {
+        // Create reference inputs (ref_rhs_fuse is only filled and used when _fuse is set)
+        SimpleTensor<T> ref_lhs{shape0, _data_type, 1, QuantizationInfo()};
+        SimpleTensor<T> ref_rhs{shape1, _data_type, 1, QuantizationInfo()};
+        SimpleTensor<T> ref_rhs_fuse{shape2, _data_type, 1, QuantizationInfo()};
+
+        // Fill reference with the same fill indices used for the target tensors
+        fill(ref_lhs, 0);
+        fill(ref_rhs, 1);
+        SimpleTensor<T> ref_dst = reference::pixel_wise_multiplication<T, T, T>(
+            ref_lhs, ref_rhs, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP, _data_type,
+            QuantizationInfo());
+        if (_fuse)
+        {
+            fill(ref_rhs_fuse, 2);
+            SimpleTensor<T> ref_dst_fuse = reference::pixel_wise_multiplication<T, T, T>(
+                ref_dst, ref_rhs_fuse, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP, _data_type,
+                QuantizationInfo());
+            return ref_dst_fuse;
+        }
+        return ref_dst;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    DataType        _data_type{};
+    bool            _is_inplace{false};
+    bool            _fuse{false};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulOneOpValidationFixture
+    : public DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape0, DataType data_type, bool is_inplace)
+    {
+        using ParentFixture = DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>;
+        ParentFixture::setup(shape0, shape0, TensorShape(), data_type, is_inplace); // rhs reuses shape0; single op
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulBroadcastValidationFixture
+    : public DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, bool is_inplace)
+    {
+        using ParentFixture = DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>;
+        ParentFixture::setup(shape0, shape1, TensorShape(), data_type, is_inplace); // empty shape2; single op
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulTwoOpsValidationFixture
+    : public DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(const TensorShape &shape0,
+               const TensorShape &shape1,
+               const TensorShape &shape2,
+               DataType           data_type,
+               bool               is_inplace,
+               bool               fuse_two_ops)
+    {
+        using ParentFixture = DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>;
+        ParentFixture::setup(shape0, shape1, shape2, data_type, is_inplace, fuse_two_ops); // plain pass-through
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_MULFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/ReshapeFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/ReshapeFixture.h
new file mode 100644
index 0000000000..bde3360940
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/ReshapeFixture.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_RESHAPEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_RESHAPEFIXTURE_H
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h"
+
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/Globals.h"
+#include "tests/validation/reference/ReshapeLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuReshapeLayerValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    {
+        _target    = compute_target(input_shape, output_shape, data_type); // output of the dynamic fusion runtime
+        _reference = compute_reference(input_shape, output_shape, data_type); // output of reference::reshape_layer
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        library->fill_tensor_uniform(tensor, i);
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // A reshape must preserve the element count; the shapes are only read here, hence the const references
+        ARM_COMPUTE_ASSERT(input_shape.total_size() == output_shape.total_size());
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch sketch{&context};
+
+        // Create sketch tensors and the reshape attributes carrying the requested output shape
+        ITensorInfo      *src_info = context.create_tensor_info(TensorInfo(input_shape, 1, data_type));
+        ITensorInfo      *dst_info = context.create_tensor_info(TensorInfo(output_shape, 1, data_type));
+        ReshapeAttributes attributes;
+        attributes.shape(output_shape);
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, src_info, attributes);
+        GpuOutput::create_op(sketch, ans_info, dst_info); // connect the reshape result to the destination tensor info
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_src{};
+        TensorType t_dst{};
+        // Initialize user tensors from the sketch tensor infos
+        t_src.allocator()->init(*src_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate and fill user tensors
+        t_src.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_src), 0); // same fill index (0) as compute_reference
+
+        // Run runtime
+        runtime.run({&t_src, &t_dst});
+
+        return t_dst;
+    }
+
+    SimpleTensor<T>
+    compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Create reference input
+        SimpleTensor<T> src{input_shape, data_type};
+
+        // Fill reference with the same fill index (0) that compute_target uses
+        fill(src, 0);
+
+        return reference::reshape_layer<T>(src, output_shape);
+    }
+
+    TensorType      _target{}; // set by setup() from compute_target()
+    SimpleTensor<T> _reference{}; // set by setup() from compute_reference()
+};
+/** [ReshapeLayer fixture] **/
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_RESHAPEFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/ResizeFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/ResizeFixture.h
new file mode 100644
index 0000000000..711767b66f
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/ResizeFixture.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_RESIZEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_RESIZEFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/SimpleTensor.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/Scale.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionResizeGenericValidationFixture : public framework::Fixture
+{
+public:
+    /** Store the configuration, pick the output size, then compute the target and the reference result. */
+    void setup(TensorShape         shape,
+               DataType            data_type,
+               QuantizationInfo    quantization_info,
+               DataLayout          data_layout,
+               InterpolationPolicy interpolation_policy,
+               SamplingPolicy      sampling_policy,
+               bool                align_corners,
+               QuantizationInfo    output_quantization_info)
+    {
+        _shape                    = shape;
+        _interpolation_policy     = interpolation_policy;
+        _sampling_policy          = sampling_policy;
+        _data_type                = data_type;
+        _input_quantization_info  = quantization_info;
+        _output_quantization_info = output_quantization_info;
+        _align_corners            = align_corners;
+        _data_layout              = data_layout;
+
+        ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NHWC); // Dynamic fusion resize supports only NHWC layout
+
+        // Sets _output_width/_output_height, which both computations below read
+        generate_scale(shape);
+
+        // shape is still in NCHW order here; compute_target() permutes its own copy
+        _target    = compute_target(shape);
+        _reference = compute_reference(shape);
+    }
+
+protected:
+    /** Choose _output_width/_output_height by scaling the input width/height with random factors. */
+    void generate_scale(const TensorShape &shape)
+    {
+        static constexpr float _min_scale{0.25f};
+        static constexpr float _max_scale{3.f};
+
+        constexpr float max_width{8192.0f}; // Clamp bounds applied to the generated output size
+        constexpr float max_height{6384.0f};
+        constexpr float min_width{1.f};
+        constexpr float min_height{1.f};
+
+        std::mt19937                          generator(library->seed());
+        std::uniform_real_distribution<float> distribution_float(_min_scale, _max_scale);
+
+        auto generate = [&](size_t input_size, float min_output, float max_output) -> int
+        {
+            const float generated_scale = distribution_float(generator); // New factor on every call
+            const int   output_size     = static_cast<int>(
+                utility::clamp(static_cast<float>(input_size) * generated_scale, min_output, max_output));
+            return output_size; // Scaled input size, clamped to [min_output, max_output], truncated to int
+        };
+        // Input shape is always given in NCHW layout; the NHWC permutation is done in compute_target()
+        const int idx_width  = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::HEIGHT);
+
+        _output_width  = generate(shape[idx_width], min_width, max_width);
+        _output_height = generate(shape[idx_height], min_height, max_height);
+    }
+
+    template <typename U>
+    void fill(U &&tensor) // Fills tensor with random values; the distribution depends on its data type
+    {
+        if (tensor.data_type() == DataType::F32)
+        {
+            std::uniform_real_distribution<float> distribution(-5.0f, 5.0f); // Uniform reals in [-5, 5)
+            library->fill(tensor, distribution, 0);
+        }
+        else if (tensor.data_type() == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{-5.0f, 5.0f}; // Same bounds, 16-bit
+            library->fill(tensor, distribution, 0);
+        }
+        else if (is_data_type_quantized(tensor.data_type()))
+        {
+            std::uniform_int_distribution<> distribution(0, 100); // Quantized types: uniform integers in [0, 100]
+            library->fill(tensor, distribution, 0);
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, 0); // Remaining data types: library uniform fill
+        }
+    }
+
+    /** Build the resize workload, run it through ClWorkloadRuntime and return the output tensor. */
+    TensorType compute_target(TensorShape shape)
+    {
+        // Test shapes are given in NCHW order; permute this copy for the tensor built with _data_layout below
+        permute(shape, PermutationVector(2U, 0U, 1U));
+
+        // Create a new workload sketch
+        CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch  sketch{&context};
+
+        // Create sketch tensors; dst_info is created without shape or type arguments
+        ITensorInfo *src_info = context.create_tensor_info(TensorInfo(shape, 1, _data_type, _data_layout));
+        src_info->set_quantization_info(_input_quantization_info);
+        ITensorInfo *dst_info = context.create_tensor_info();
+
+        ResizeAttributes attributes; // Configured from the fixture members
+        attributes.align_corners(_align_corners)
+            .sampling_policy(_sampling_policy)
+            .interpolation_policy(_interpolation_policy)
+            .output_width(_output_width)
+            .output_height(_output_height);
+
+        ITensorInfo *scale_result_info = FunctionType::create_op(sketch, src_info, attributes);
+        GpuOutput::create_op(sketch, scale_result_info, dst_info);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate memory for every auxiliary tensor reported by the runtime, if there are any
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_src{};
+        TensorType t_dst{};
+
+        // Initialize user tensors from the sketch tensor infos
+        t_src.allocator()->init(*src_info);
+        t_dst.allocator()->init(*dst_info);
+
+        // Allocate both user tensors, then fill only the source
+        t_src.allocator()->allocate();
+        t_dst.allocator()->allocate();
+        fill(AccessorType(t_src));
+
+        // Run runtime with the user tensors (source first, destination second)
+        runtime.run({&t_src, &t_dst});
+
+        return t_dst;
+    }
+
+    /** Compute the expected output with reference::scale on the unpermuted (NCHW) shape. */
+    SimpleTensor<T> compute_reference(const TensorShape &shape)
+    {
+        // Create reference input with the fixture's data type and input quantization info
+        SimpleTensor<T> src{shape, _data_type, 1, _input_quantization_info};
+
+        // Reference code is NCHW, so the input shapes are NCHW
+        const int idx_width  = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::HEIGHT);
+
+        const float scale_x = static_cast<float>(_output_width) / shape[idx_width]; // Output width / input width
+        const float scale_y = static_cast<float>(_output_height) / shape[idx_height]; // Output height / input height
+
+        // Fill reference with the same fill() that compute_target() uses
+        fill(src);
+        return reference::scale<T>(src, scale_x, scale_y, _interpolation_policy, BorderMode::REPLICATE,
+                                   static_cast<T>(0), _sampling_policy, /* ceil_policy_scale */ false, _align_corners,
+                                   _output_quantization_info);
+    }
+
+    TensorType          _target{};
+    SimpleTensor<T>     _reference{};
+    TensorShape         _shape{};
+    InterpolationPolicy _interpolation_policy{};
+    SamplingPolicy      _sampling_policy{};
+    DataType            _data_type{};
+    DataLayout          _data_layout{};
+    QuantizationInfo    _input_quantization_info{};
+    QuantizationInfo    _output_quantization_info{};
+    bool                _align_corners{false};
+    int                 _output_width{0};
+    int                 _output_height{0};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionResizeValidationFixture // Resize fixture variant that takes no quantization info
+    : public DynamicFusionResizeGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape         shape,
+               DataType            data_type,
+               DataLayout          data_layout,
+               InterpolationPolicy policy,
+               SamplingPolicy      sampling_policy,
+               bool                align_corners)
+    {
+        DynamicFusionResizeGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            shape, data_type, QuantizationInfo(), data_layout, policy, sampling_policy, align_corners,
+            QuantizationInfo()); // Input and output quantization info are both default-constructed
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false>
+class DynamicFusionResizeQuantizedValidationFixture // NOTE(review): mixed_layout is not used inside this class
+    : public DynamicFusionResizeGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape         shape,
+               DataType            data_type,
+               QuantizationInfo    quantization_info,
+               DataLayout          data_layout,
+               InterpolationPolicy policy,
+               SamplingPolicy      sampling_policy,
+               bool                align_corners)
+    {
+        DynamicFusionResizeGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            shape, data_type, quantization_info, data_layout, policy, sampling_policy, align_corners,
+            quantization_info); // The same quantization_info serves as input and output quantization
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_RESIZEFIXTURE_H
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/SoftmaxFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/SoftmaxFixture.h
new file mode 100644
index 0000000000..175d4ff889
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/SoftmaxFixture.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_SOFTMAXFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_SOFTMAXFIXTURE_H
+
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/SimpleTensor.h"
+#include "tests/validation/reference/SoftmaxLayer.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionSoftmaxValidationGenericFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape, DataType data_type, float beta, size_t axis, bool is_log)
+    {
+        _reference = compute_reference(shape, data_type, beta, axis, is_log); // axis converts from size_t to int32_t
+        _target    = compute_target(shape, data_type, beta, axis, is_log);    // Same arguments as the reference
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor) // Fills tensor with random values; the distribution depends on its data type
+    {
+        if (tensor.data_type() == DataType::F32)
+        {
+            std::uniform_real_distribution<float> distribution(-10.0f, 10.0f); // Uniform reals in [-10, 10)
+            library->fill(tensor, distribution, 0);
+        }
+        else if (tensor.data_type() == DataType::F16)
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{-10.0f, 10.0f}; // Same bounds
+            library->fill(tensor, distribution, 0);
+        }
+        else if (!is_data_type_quantized(tensor.data_type())) // Other non-quantized types
+        {
+            // NOTE(review): ResizeFixture.h takes this integer branch for quantized types instead; confirm intent
+            std::uniform_int_distribution<> distribution(0, 100);
+            library->fill(tensor, distribution, 0);
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, 0); // Quantized types: library uniform fill
+        }
+    }
+
+    /** Build the softmax workload, run it through ClWorkloadRuntime and return the output tensor. */
+    TensorType compute_target(const TensorShape &shape, DataType data_type, float beta, int32_t axis, bool is_log)
+    {
+        // Create a new workload sketch
+        CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        GpuWorkloadContext context        = GpuWorkloadContext{&cl_compile_ctx};
+        GpuWorkloadSketch  sketch{&context};
+
+        SoftmaxAttributes softmax_attr{};
+        softmax_attr.axis(axis).beta(beta).is_log_softmax(is_log);
+        ITensorInfo *src_info = context.create_tensor_info(shape, 1, data_type);
+        ITensorInfo *dst_info = context.create_tensor_info(shape, 1, data_type); // Same shape and type as src_info
+        FunctionType::create_op(sketch, src_info, dst_info, softmax_attr); // dst_info is passed to the operator directly
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+        for (auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+        // Construct user tensors
+        TensorType src{};
+        TensorType dst{};
+
+        // Initialize user tensors from the sketch tensor infos
+        src.allocator()->init(*src_info);
+        dst.allocator()->init(*dst_info);
+
+        // Allocate both user tensors, then fill only the source
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+        fill(AccessorType(src));
+        // Run runtime with the user tensors (source first, destination second)
+        runtime.run({&src, &dst});
+
+        return dst;
+    }
+
+    SimpleTensor<T>
+    compute_reference(const TensorShape &shape, DataType data_type, float beta, int32_t axis, bool is_log)
+    {
+        // Create reference input with the requested shape and data type
+        SimpleTensor<T> src{shape, data_type, 1};
+
+        // Fill reference with the same fill() that compute_target() uses
+        fill(src);
+
+        return reference::softmax_layer<T>(src, beta, axis, is_log); // Reference softmax with the same parameters
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionSoftmaxValidationFixture // Forwards setup() unchanged to the generic softmax fixture
+    : public DynamicFusionSoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape shape, DataType data_type, float beta, size_t axis, bool is_log)
+    {
+        DynamicFusionSoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            shape, data_type, beta, axis, is_log); // All arguments are passed through as received
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_SOFTMAXFIXTURE_H
diff --git a/tests/validation/gpu/unit/Context.cpp b/tests/validation/gpu/unit/Context.cpp
index 06b4a83925..598e219fd9 100644
--- a/tests/validation/gpu/unit/Context.cpp
+++ b/tests/validation/gpu/unit/Context.cpp
@@ -21,11 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-
-#include "arm_compute/Acl.hpp"
+#include "tests/validation/fixtures/UNIT/ContextFixture.h"
 
 #include "src/gpu/cl/ClContext.h"
 
@@ -41,67 +37,9 @@ TEST_SUITE(CL)
 TEST_SUITE(UNIT)
 TEST_SUITE(Context)
 
-/** Test-case for AclCreateContext and AclDestroy Context
- *
- * Validate that AclCreateContext can create and destroy a context
- *
- * Test Steps:
- *  - Call AclCreateContext with valid target
- *  - Confirm that context is not nullptr and error code is AclSuccess
- *  - Destroy context
- *  - Confirm that AclSuccess is reported
- */
-TEST_CASE(SimpleContextCApi, framework::DatasetMode::ALL)
-{
-    AclContext ctx = nullptr;
-    ARM_COMPUTE_ASSERT(AclCreateContext(&ctx, AclGpuOcl, nullptr) == AclStatus::AclSuccess);
-    ARM_COMPUTE_ASSERT(ctx != nullptr);
-    ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclSuccess);
-}
-
-/** Test-case for Context from the C++ interface
- *
- * Test Steps:
- *  - Create a Context obejct
- *  - Confirm that StatusCode::Success is reported
- *  - Confirm that equality operator works
- *  - Confirm that inequality operator works
- */
-TEST_CASE(SimpleContextCppApi, framework::DatasetMode::ALL)
-{
-    acl::StatusCode status = acl::StatusCode::Success;
-    acl::Context    ctx(acl::Target::GpuOcl, &status);
-    ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success);
-
-    auto ctx_eq = ctx;
-    ARM_COMPUTE_ASSERT(ctx_eq == ctx);
-
-    acl::Context ctx_ienq(acl::Target::GpuOcl, &status);
-    ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success);
-    ARM_COMPUTE_ASSERT(ctx_ienq != ctx);
-}
-
-/** Test-case for CpuCapabilities
- *
- * Validate that AclCreateContext can create/destroy multiple contexts with different options
- *
- * Test Steps:
- *  - Call AclCreateContext with different targets
- *  - Confirm that AclSuccess is reported
- *  - Destroy all contexts
- *  - Confirm that AclSuccess is reported
- */
-TEST_CASE(MultipleContexts, framework::DatasetMode::ALL)
-{
-    const unsigned int num_tests = 5;
-    std::array<AclContext, num_tests> ctxs{};
-    for(unsigned int i = 0; i < num_tests; ++i)
-    {
-        ARM_COMPUTE_ASSERT(AclCreateContext(&ctxs[i], AclTarget::AclGpuOcl, nullptr) == AclStatus::AclSuccess);
-        ARM_COMPUTE_ASSERT(ctxs[i] != nullptr);
-        ARM_COMPUTE_ASSERT(AclDestroyContext(ctxs[i]) == AclStatus::AclSuccess);
-    }
-}
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCApi, SimpleContextCApiFixture<AclTarget::AclGpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCppApi, SimpleContextCppApiFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MultipleContexts, MultipleContextsFixture<AclTarget::AclGpuOcl>, framework::DatasetMode::ALL)
 
 /** Test-case for MLGO kernel configuration file
  *
@@ -148,9 +86,9 @@ TEST_CASE(CheckMLGO, framework::DatasetMode::ALL)
     ofs << mlgo_str;
     ofs.close();
 
-    AclContextOptions opts  = acl_default_ctx_options;
-    opts.kernel_config_file = mlgo_filename.c_str();
-    arm_compute::gpu::opencl::ClContext ctx(&opts);
+    acl::Context::Options opts;
+    opts.copts.kernel_config_file = mlgo_filename.c_str();
+    arm_compute::gpu::opencl::ClContext ctx(&opts.copts);
 
     const MLGOHeuristics &heuristics = ctx.mlgo();
 
diff --git a/tests/validation/gpu/unit/Queue.cpp b/tests/validation/gpu/unit/Queue.cpp
new file mode 100644
index 0000000000..8154a7954f
--- /dev/null
+++ b/tests/validation/gpu/unit/Queue.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "tests/validation/fixtures/UNIT/QueueFixture.h"
+
+#include "arm_compute/AclOpenClExt.h"
+#include "src/gpu/cl/ClQueue.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(Queue)
+// Cases registered without a test body in this file; fixtures that take a target use acl::Target::GpuOcl
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueueWithInvalidContext, CreateQueueWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueuerWithInvalidOptions, CreateQueuerWithInvalidOptionsFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidQueue, DestroyInvalidQueueFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleQueue, SimpleQueueFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+
+/** Create a high-priority CL queue on the context's device when cl_khr_priority_hints is advertised. */
+TEST_CASE(KhrQueuePriorities, framework::DatasetMode::ALL)
+{
+    acl::StatusCode err = acl::StatusCode::Success;
+
+    acl::Context ctx(acl::Target::GpuOcl, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    acl::Queue queue(ctx, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    cl_device_id cl_dev = nullptr;
+    auto         status = AclGetClDevice(ctx.get(), &cl_dev);
+    ARM_COMPUTE_ASSERT(status == AclSuccess);
+
+    std::string extensions = cl::Device(cl_dev).getInfo<CL_DEVICE_EXTENSIONS>();
+    if(extensions.find("cl_khr_priority_hints") != std::string::npos) // Skip the check when not advertised
+    {
+        cl_int error = CL_SUCCESS;
+
+        cl_context cl_ctx = nullptr;
+        status            = AclGetClContext(ctx.get(), &cl_ctx); // Same status variable as the device query above
+        ARM_COMPUTE_ASSERT(status == AclSuccess);
+
+        /* Check a queue with high priority; the property list is terminated by 0 */
+        cl_queue_properties queue_properties[] = { CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_HIGH_KHR, 0 };
+        cl_command_queue    priority_queue     = clCreateCommandQueueWithProperties(cl_ctx, cl_dev, queue_properties, &error);
+        ARM_COMPUTE_ASSERT(error == CL_SUCCESS);
+        clReleaseCommandQueue(priority_queue);
+    }
+}
+
+TEST_SUITE_END() // Queue
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/gpu/unit/Tensor.cpp b/tests/validation/gpu/unit/Tensor.cpp
new file mode 100644
index 0000000000..18102733ce
--- /dev/null
+++ b/tests/validation/gpu/unit/Tensor.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "tests/validation/fixtures/UNIT/TensorFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(Tensor)
+// Cases registered without a test body in this file; fixtures that take a target use acl::Target::GpuOcl
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidContext, CreateTensorWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidDescriptor, CreateTensorWithInvalidDescriptorFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensor, DestroyInvalidTensorFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensor, SimpleTensorFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(TensorStress, TensorStressFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapInvalidTensor, MapInvalidTensorFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapAllocatedTensor, MapAllocatedTensorFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetSize, TensorSizeFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidSize, InvalidTensorSizeFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetDescriptor, DescriptorConversionFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidDescriptor, InvalidDescriptorConversionFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+
+TEST_SUITE_END() // Tensor
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/gpu/unit/TensorPack.cpp b/tests/validation/gpu/unit/TensorPack.cpp
new file mode 100644
index 0000000000..b62426d056
--- /dev/null
+++ b/tests/validation/gpu/unit/TensorPack.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "tests/validation/fixtures/UNIT/TensorPackFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(TensorPack)
+// Cases registered without a test body in this file; fixtures that take a target use acl::Target::GpuOcl
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorPackWithInvalidContext, CreateTensorPackWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensorPack, DestroyInvalidTensorPackFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(AddInvalidObjectToTensorPack, AddInvalidObjectToTensorPackFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensorPack, SimpleTensorPackFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MultipleTensorsInPack, MultipleTensorsInPackFixture<acl::Target::GpuOcl>, framework::DatasetMode::ALL)
+
+TEST_SUITE_END() // TensorPack
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index 664b969125..2172362bdd 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "ActivationLayer.h"
 
 #include "arm_compute/core/Types.h"
+
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -40,7 +41,7 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
     ARM_COMPUTE_UNUSED(oq_info);
 
     // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
+    SimpleTensor<T> dst{src.shape(), src.data_type(), 1};
 
     // Compute reference
     const T a(info.a());
@@ -48,7 +49,7 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
 #if defined(_OPENMP)
     #pragma omp parallel for
 #endif /* _OPENMP */
-    for(int i = 0; i < src.num_elements(); ++i)
+    for (int i = 0; i < src.num_elements(); ++i)
     {
         dst[i] = activate_float<T>(src[i], a, b, info.activation());
     }
@@ -57,7 +58,8 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
 }
 
 template <>
-SimpleTensor<uint8_t> activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+SimpleTensor<uint8_t>
+activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
     const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
 
@@ -68,7 +70,8 @@ SimpleTensor<uint8_t> activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src
 }
 
 template <>
-SimpleTensor<int8_t> activation_layer<int8_t>(const SimpleTensor<int8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+SimpleTensor<int8_t>
+activation_layer<int8_t>(const SimpleTensor<int8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
     const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
 
@@ -79,7 +82,8 @@ SimpleTensor<int8_t> activation_layer<int8_t>(const SimpleTensor<int8_t> &src, A
 }
 
 template <>
-SimpleTensor<int16_t> activation_layer<int16_t>(const SimpleTensor<int16_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+SimpleTensor<int16_t>
+activation_layer<int16_t>(const SimpleTensor<int16_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
     const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
 
@@ -88,9 +92,14 @@ SimpleTensor<int16_t> activation_layer<int16_t>(const SimpleTensor<int16_t> &src
     SimpleTensor<int16_t> dst     = convert_to_symmetric<int16_t>(dst_tmp, dst_qinfo);
     return dst;
 }
-template SimpleTensor<int32_t> activation_layer(const SimpleTensor<int32_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
-template SimpleTensor<float> activation_layer(const SimpleTensor<float> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
-template SimpleTensor<half> activation_layer(const SimpleTensor<half> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<int32_t>
+activation_layer(const SimpleTensor<int32_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<float>
+activation_layer(const SimpleTensor<float> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<half>
+activation_layer(const SimpleTensor<half> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<bfloat16>
+activation_layer(const SimpleTensor<bfloat16> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ActivationLayer.h b/tests/validation/reference/ActivationLayer.h
index 8aad1af63e..7f896bd696 100644
--- a/tests/validation/reference/ActivationLayer.h
+++ b/tests/validation/reference/ActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020,2022,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_ACTIVATION_LAYER_H
-#define ARM_COMPUTE_TEST_ACTIVATION_LAYER_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_ACTIVATIONLAYER_H
+#define ACL_TESTS_VALIDATION_REFERENCE_ACTIVATIONLAYER_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -40,7 +40,7 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
 {
     T ret;
 
-    switch(activation)
+    switch (activation)
     {
         case ActivationLayerInfo::ActivationFunction::ABS:
             ret = std::abs(x);
@@ -61,13 +61,13 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
             ret = std::min<T>(a, std::max<T>(b, x));
             break;
         case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-            ret = (x > 0) ? x : a * x;
+            ret = x > static_cast<T>(0) ? x : static_cast<T>(a * x);
             break;
         case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-            ret = std::log(static_cast<T>(1) + std::exp(x));
+            ret = std::log(static_cast<T>(1) + std::exp(static_cast<double>(x)));
             break;
         case ActivationLayerInfo::ActivationFunction::ELU:
-            ret = (x > 0) ? x : a * (std::exp(x) - static_cast<T>(1));
+            ret = x > static_cast<T>(0) ? x : static_cast<T>(a * (std::exp(x) - static_cast<T>(1)));
             break;
         case ActivationLayerInfo::ActivationFunction::SQRT:
             ret = std::sqrt(x);
@@ -82,7 +82,14 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
             ret = x;
             break;
         case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-            ret = x * ((std::min(std::max(static_cast<T>(x + 3), static_cast<T>(0.0f)), static_cast<T>(6.0f))) * 0.166666667f);
+            ret = x * ((std::min(std::max(static_cast<T>(x + 3), static_cast<T>(0.0f)), static_cast<T>(6.0f))) *
+                       0.166666667f);
+            break;
+        case ActivationLayerInfo::ActivationFunction::SWISH:
+            ret = static_cast<T>(x) / (static_cast<T>(1) + std::exp(-a * x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::GELU:
+            ret = x * 0.5f * (1 + erf(x / std::sqrt(2.0f)));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported activation function");
@@ -93,9 +100,11 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
 }
 
 template <typename T>
-SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info = QuantizationInfo());
+SimpleTensor<T> activation_layer(const SimpleTensor<T>  &src,
+                                 ActivationLayerInfo     info,
+                                 const QuantizationInfo &oq_info = QuantizationInfo());
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ACTIVATION_LAYER_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_ACTIVATIONLAYER_H
diff --git a/tests/validation/reference/BatchToSpaceLayer.cpp b/tests/validation/reference/BatchToSpaceLayer.cpp
index 404ee73cac..63d121f59b 100644
--- a/tests/validation/reference/BatchToSpaceLayer.cpp
+++ b/tests/validation/reference/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,10 @@
  */
 #include "BatchToSpaceLayer.h"
 
+#include "arm_compute/core/Validate.h"
 #include "tests/validation/Helpers.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 namespace arm_compute
 {
 namespace test
@@ -35,32 +37,37 @@ namespace reference
 {
 // Batch to Space
 template <typename T>
-SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape)
+SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape)
 {
-    ARM_COMPUTE_ERROR_ON(block_shape[0] <= 0);
-    ARM_COMPUTE_ERROR_ON(block_shape[1] <= 0);
-    SimpleTensor<T> result(dst_shape, src.data_type());
+    ARM_COMPUTE_ERROR_ON(block_shape[0] < 1);
+    ARM_COMPUTE_ERROR_ON(block_shape[1] < 1);
+    const auto expected_dst_shape = misc::shape_calculator::compute_batch_to_space_shape(DataLayout::NCHW, src.shape(), block_shape[0], block_shape[1], crop_info);
+    ARM_COMPUTE_ERROR_ON(arm_compute::detail::have_different_dimensions(expected_dst_shape, dst_shape, 0));
+    ARM_COMPUTE_UNUSED(expected_dst_shape);
 
-    int        in_pos    = 0;
-    const auto width_in  = static_cast<int>(src.shape()[0]);
-    const auto height_in = static_cast<int>(src.shape()[1]);
-    const auto z_in      = static_cast<int>(src.shape()[2]);
-    const auto batch_in  = static_cast<int>(src.shape()[3]);
+    SimpleTensor<T> result(dst_shape, src.data_type());
+    int             out_pos    = 0;
+    const auto      width_out  = static_cast<int>(dst_shape[0]);
+    const auto      height_out = static_cast<int>(dst_shape[1]);
+    const auto      z_out      = static_cast<int>(dst_shape[2]);
+    const auto      batch_out  = static_cast<int>(dst_shape[3]);
 
-    for(int batch = 0; batch < batch_in; ++batch)
+    for(int batch = 0; batch < batch_out; ++batch)
     {
-        for(int z = 0; z < z_in; ++z)
+        for(int z = 0; z < z_out; ++z)
         {
-            for(int y = 0; y < height_in; ++y)
+            for(int y = 0; y < height_out; ++y)
             {
-                for(int x = 0; x < width_in; ++x)
+                for(int x = 0; x < width_out; ++x)
                 {
-                    const int r       = src.shape()[3] / (block_shape[0] * block_shape[1]);
-                    const int out_x   = (block_shape[0] * x + (batch / r) % block_shape[0]);
-                    const int out_y   = (block_shape[1] * y + (batch / r) / block_shape[0]);
-                    const int out_pos = out_x + dst_shape[0] * out_y + z * dst_shape[0] * dst_shape[1] + (batch % r) * dst_shape[0] * dst_shape[1] * dst_shape[2];
-                    result[out_pos]   = src[in_pos];
-                    ++in_pos;
+                    const int x_c      = x + crop_info.left;
+                    const int y_c      = y + crop_info.top;
+                    const int in_batch = batch + ((x_c % block_shape[0]) + (y_c % block_shape[1]) * (block_shape[0])) * dst_shape[3];
+                    const int in_x     = x_c / block_shape[0];
+                    const int in_y     = y_c / block_shape[1];
+                    const int in_pos   = in_x + src.shape()[0] * in_y + z * src.shape()[0] * src.shape()[1] + in_batch * src.shape()[0] * src.shape()[1] * src.shape()[2];
+                    result[out_pos]    = src[in_pos];
+                    ++out_pos;
                 }
             }
         }
@@ -68,8 +75,8 @@ SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<in
 
     return result;
 }
-template SimpleTensor<float> batch_to_space(const SimpleTensor<float> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
-template SimpleTensor<half> batch_to_space(const SimpleTensor<half> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
+template SimpleTensor<float> batch_to_space(const SimpleTensor<float> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape);
+template SimpleTensor<half> batch_to_space(const SimpleTensor<half> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/BatchToSpaceLayer.h b/tests/validation/reference/BatchToSpaceLayer.h
index 52556cb53f..a37bfc3373 100644
--- a/tests/validation/reference/BatchToSpaceLayer.h
+++ b/tests/validation/reference/BatchToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H
 #define ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H
 
+#include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
 
@@ -36,7 +37,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
+SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Box3x3.cpp b/tests/validation/reference/Box3x3.cpp
deleted file mode 100644
index ccc7f1b3b9..0000000000
--- a/tests/validation/reference/Box3x3.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-
-#include "Box3x3.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> box3x3(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-    const std::array<T, 9> filter{ { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };
-    const float    scale        = 1.f / static_cast<float>(filter.size());
-    const uint32_t num_elements = src.num_elements();
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-    {
-        const Coordinates id = index2coord(src.shape(), element_idx);
-        apply_2d_spatial_filter(id, src, dst, TensorShape(3U, 3U), filter.data(), scale, border_mode, constant_border_value);
-    }
-    return dst;
-}
-
-template SimpleTensor<uint8_t> box3x3(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/CannyEdgeDetector.cpp b/tests/validation/reference/CannyEdgeDetector.cpp
deleted file mode 100644
index aa2351ddd8..0000000000
--- a/tests/validation/reference/CannyEdgeDetector.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "CannyEdgeDetector.h"
-
-#include "Utils.h"
-#include "support/ToolchainSupport.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/Magnitude.h"
-#include "tests/validation/reference/NonMaximaSuppression.h"
-#include "tests/validation/reference/Phase.h"
-#include "tests/validation/reference/Sobel.h"
-
-#include <cmath>
-#include <stack>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-const auto MARK_ZERO  = 0u;
-const auto MARK_MAYBE = 127u;
-const auto MARK_EDGE  = 255u;
-
-template <typename T>
-void trace_edge(SimpleTensor<T> &dst, const ValidRegion &valid_region)
-{
-    std::stack<Coordinates> pixels_stack;
-    for(auto i = 0; i < dst.num_elements(); ++i)
-    {
-        if(dst[i] == MARK_EDGE)
-        {
-            pixels_stack.push(index2coord(dst.shape(), i));
-        }
-    }
-
-    while(!pixels_stack.empty())
-    {
-        const Coordinates pixel_coord = pixels_stack.top();
-        pixels_stack.pop();
-
-        std::array<Coordinates, 8> neighbours =
-        {
-            {
-                Coordinates(pixel_coord.x() - 1, pixel_coord.y() + 0),
-                Coordinates(pixel_coord.x() + 1, pixel_coord.y() + 0),
-                Coordinates(pixel_coord.x() - 1, pixel_coord.y() - 1),
-                Coordinates(pixel_coord.x() + 1, pixel_coord.y() + 1),
-                Coordinates(pixel_coord.x() + 0, pixel_coord.y() - 1),
-                Coordinates(pixel_coord.x() + 0, pixel_coord.y() + 1),
-                Coordinates(pixel_coord.x() + 1, pixel_coord.y() - 1),
-                Coordinates(pixel_coord.x() - 1, pixel_coord.y() + 1)
-            }
-        };
-
-        // Mark MAYBE neighbours as edges since they are next to an EDGE
-        std::for_each(neighbours.begin(), neighbours.end(), [&](Coordinates & coord)
-        {
-            if(is_in_valid_region(valid_region, coord))
-            {
-                const size_t pixel_index = coord2index(dst.shape(), coord);
-                const T      pixel       = dst[pixel_index];
-                if(pixel == MARK_MAYBE)
-                {
-                    dst[pixel_index] = MARK_EDGE;
-                    pixels_stack.push(coord);
-                }
-            }
-        });
-    }
-
-    // Mark all remaining MAYBE pixels as ZERO (not edges)
-    for(auto i = 0; i < dst.num_elements(); ++i)
-    {
-        if(dst[i] == MARK_MAYBE)
-        {
-            dst[i] = MARK_ZERO;
-        }
-    }
-}
-
-template <typename U, typename T>
-SimpleTensor<T> canny_edge_detector_impl(const SimpleTensor<T> &src, int32_t upper, int32_t lower, int gradient_size, MagnitudeType norm_type,
-                                         BorderMode border_mode, T constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(gradient_size != 3 && gradient_size != 5 && gradient_size != 7);
-    ARM_COMPUTE_ERROR_ON(lower < 0 || lower >= upper);
-
-    // Output: T == uint8_t
-    SimpleTensor<T> dst{ src.shape(), src.data_type() };
-    ValidRegion     valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(gradient_size / 2 + 1));
-
-    // Sobel computation: U == int16_t or int32_t
-    SimpleTensor<U> gx{};
-    SimpleTensor<U> gy{};
-    std::tie(gx, gy) = sobel<U>(src, gradient_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
-
-    using unsigned_U = typename traits::make_unsigned_conditional_t<U>::type;
-    using promoted_U = typename common_promoted_signed_type<U>::intermediate_type;
-
-    // Gradient magnitude and phase (edge direction)
-    const DataType           mag_data_type = gx.data_type() == DataType::S16 ? DataType::U16 : DataType::U32;
-    SimpleTensor<unsigned_U> grad_mag{ gx.shape(), mag_data_type };
-    SimpleTensor<uint8_t>    grad_dir{ gy.shape(), DataType::U8 };
-
-    for(auto i = 0; i < grad_mag.num_elements(); ++i)
-    {
-        double mag = 0.f;
-
-        if(norm_type == MagnitudeType::L2NORM)
-        {
-            mag = support::cpp11::round(std::sqrt(static_cast<promoted_U>(gx[i]) * gx[i] + static_cast<promoted_U>(gy[i]) * gy[i]));
-        }
-        else // MagnitudeType::L1NORM
-        {
-            mag = static_cast<promoted_U>(std::abs(gx[i])) + static_cast<promoted_U>(std::abs(gy[i]));
-        }
-
-        float angle = 180.f * std::atan2(static_cast<float>(gy[i]), static_cast<float>(gx[i])) / M_PI;
-        grad_dir[i] = support::cpp11::round(angle < 0.f ? 180 + angle : angle);
-        grad_mag[i] = saturate_cast<unsigned_U>(mag);
-    }
-
-    /*
-        Quantise the phase into 4 directions
-          0°  dir=0    0.0 <= p <  22.5 or 157.5 <= p < 180
-         45°  dir=1   22.5 <= p <  67.5
-         90°  dir=2   67.5 <= p < 112.5
-        135°  dir=3  112.5 <= p < 157.5
-    */
-    for(auto i = 0; i < grad_dir.num_elements(); ++i)
-    {
-        const auto direction = std::fabs(grad_dir[i]);
-        grad_dir[i]          = (direction < 22.5 || direction >= 157.5) ? 0 : (direction < 67.5) ? 1 : (direction < 112.5) ? 2 : 3;
-    }
-
-    // Non-maximum suppression
-    std::vector<int> strong_edges;
-    const auto       upper_thresh = static_cast<uint32_t>(upper);
-    const auto       lower_thresh = static_cast<uint32_t>(lower);
-
-    const auto pixel_at_offset = [&](const SimpleTensor<unsigned_U> &tensor, const Coordinates & coord, int xoffset, int yoffset)
-    {
-        return tensor_elem_at(tensor, Coordinates{ coord.x() + xoffset, coord.y() + yoffset }, border_mode, static_cast<unsigned_U>(constant_border_value));
-    };
-
-    for(auto i = 0; i < dst.num_elements(); ++i)
-    {
-        const auto coord = index2coord(dst.shape(), i);
-        if(!is_in_valid_region(valid_region, coord) || grad_mag[i] <= lower_thresh)
-        {
-            dst[i] = MARK_ZERO;
-            continue;
-        }
-
-        unsigned_U mag_90;
-        unsigned_U mag90;
-        switch(grad_dir[i])
-        {
-            case 0: // North/South edge direction, compare against East/West pixels (left & right)
-                mag_90 = pixel_at_offset(grad_mag, coord, -1, 0);
-                mag90  = pixel_at_offset(grad_mag, coord, 1, 0);
-                break;
-            case 1: // NE/SW edge direction, compare against NW/SE pixels (top-left & bottom-right)
-                mag_90 = pixel_at_offset(grad_mag, coord, -1, -1);
-                mag90  = pixel_at_offset(grad_mag, coord, +1, +1);
-                break;
-            case 2: // East/West edge direction, compare against North/South pixels (top & bottom)
-                mag_90 = pixel_at_offset(grad_mag, coord, 0, -1);
-                mag90  = pixel_at_offset(grad_mag, coord, 0, +1);
-                break;
-            case 3: // NW/SE edge direction, compare against NE/SW pixels (top-right & bottom-left)
-                mag_90 = pixel_at_offset(grad_mag, coord, +1, -1);
-                mag90  = pixel_at_offset(grad_mag, coord, -1, +1);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Invalid gradient phase provided");
-                break;
-        }
-
-        // Potential edge if greater than both pixels at +/-90° on either side
-        if(grad_mag[i] > mag_90 && grad_mag[i] > mag90)
-        {
-            // Double thresholding and edge tracing
-            if(grad_mag[i] > upper_thresh)
-            {
-                dst[i] = MARK_EDGE; // Definite edge pixel
-                strong_edges.emplace_back(i);
-            }
-            else
-            {
-                dst[i] = MARK_MAYBE;
-            }
-        }
-        else
-        {
-            dst[i] = MARK_ZERO; // Since not greater than neighbours
-        }
-    }
-
-    // Final edge tracing
-    trace_edge<T>(dst, valid_region);
-    return dst;
-}
-} // namespace
-
-template <typename T>
-SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src,
-                                    int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
-                                    BorderMode border_mode, T constant_border_value)
-{
-    if(gradient_size < 7)
-    {
-        return canny_edge_detector_impl<int16_t>(src, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
-    }
-    else
-    {
-        return canny_edge_detector_impl<int32_t>(src, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
-    }
-}
-
-template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src,
-                                                   int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
-                                                   BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/CannyEdgeDetector.h b/tests/validation/reference/CannyEdgeDetector.h
deleted file mode 100644
index e05895ab95..0000000000
--- a/tests/validation/reference/CannyEdgeDetector.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H
-#define ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H
-
-#include "arm_compute/core/Types.h"
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src,
-                                    int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
-                                    BorderMode border_mode, T constant_border_value = 0);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H */
diff --git a/tests/validation/reference/ChannelCombine.cpp b/tests/validation/reference/ChannelCombine.cpp
deleted file mode 100644
index dcd4cf551b..0000000000
--- a/tests/validation/reference/ChannelCombine.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ChannelCombine.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-template <typename T>
-inline std::vector<SimpleTensor<T>> create_image_planes(const TensorShape &shape, Format format)
-{
-    TensorShape image_shape = adjust_odd_shape(shape, format);
-
-    std::vector<SimpleTensor<T>> image_planes;
-
-    switch(format)
-    {
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-        {
-            image_planes.emplace_back(image_shape, format);
-            break;
-        }
-        case Format::NV12:
-        case Format::NV21:
-        {
-            TensorShape shape_uv88 = calculate_subsampled_shape(image_shape, Format::UV88);
-
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(shape_uv88, Format::UV88);
-            break;
-        }
-        case Format::IYUV:
-        {
-            TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, Format::IYUV);
-
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(shape_sub2, Format::U8);
-            image_planes.emplace_back(shape_sub2, Format::U8);
-            break;
-        }
-        case Format::YUV444:
-        {
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(image_shape, Format::U8);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    return image_planes;
-}
-} // namespace
-
-template <typename T>
-std::vector<SimpleTensor<T>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<T>> &image_planes, Format format)
-{
-    std::vector<SimpleTensor<T>> dst = create_image_planes<T>(shape, format);
-
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(unsigned int plane_idx = 0; plane_idx < dst.size(); ++plane_idx)
-    {
-        SimpleTensor<T> &dst_tensor   = dst[plane_idx];
-        const uint32_t   num_elements = dst_tensor.num_elements();
-
-        for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-        {
-            Coordinates coord = index2coord(dst_tensor.shape(), element_idx);
-
-            switch(format)
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                {
-                    // Copy R/G/B or A channel
-                    for(int channel_idx = 0; channel_idx < dst_tensor.num_channels(); ++channel_idx)
-                    {
-                        const T &src_value = reinterpret_cast<const T *>(image_planes[channel_idx](coord))[0];
-                        T       &dst_value = reinterpret_cast<T *>(dst_tensor(coord))[channel_idx];
-
-                        dst_value = src_value;
-                    }
-                    break;
-                }
-                case Format::YUYV422:
-                case Format::UYVY422:
-                {
-                    // Find coordinates of the sub-sampled pixel
-                    const Coordinates coord_hori(coord.x() / 2, coord.y());
-
-                    const T &src0 = reinterpret_cast<const T *>(image_planes[0](coord))[0];
-                    const T &src1 = reinterpret_cast<const T *>(image_planes[1](coord_hori))[0];
-
-                    const int shift = (Format::YUYV422 == format) ? 1 : 0;
-                    T        &dst0  = reinterpret_cast<T *>(dst_tensor(coord))[1 - shift];
-                    T        &dst1  = reinterpret_cast<T *>(dst_tensor(coord))[0 + shift];
-
-                    dst0 = src0;
-                    dst1 = src1;
-
-                    Coordinates coord2 = index2coord(dst_tensor.shape(), ++element_idx);
-
-                    const T &src2 = reinterpret_cast<const T *>(image_planes[0](coord2))[0];
-                    const T &src3 = reinterpret_cast<const T *>(image_planes[2](coord_hori))[0];
-
-                    T &dst2 = reinterpret_cast<T *>(dst_tensor(coord2))[1 - shift];
-                    T &dst3 = reinterpret_cast<T *>(dst_tensor(coord2))[0 + shift];
-
-                    dst2 = src2;
-                    dst3 = src3;
-
-                    break;
-                }
-                case Format::NV12:
-                case Format::NV21:
-                {
-                    if(0U == plane_idx)
-                    {
-                        // Get and combine Y channel from plane0 of destination multi-image
-                        dst_tensor[element_idx] = image_planes[0][element_idx];
-                    }
-                    else
-                    {
-                        const int shift = (Format::NV12 == format) ? 0 : 1;
-
-                        // Get U channel from plane1 and V channel from plane2 of the source
-                        const T &src_u0 = reinterpret_cast<const T *>(image_planes[1](coord))[0];
-                        const T &src_v0 = reinterpret_cast<const T *>(image_planes[2](coord))[0];
-
-                        // Get U and V channel from plane1 of destination multi-image
-                        T &dst_u0 = reinterpret_cast<T *>(dst_tensor(coord))[0 + shift];
-                        T &dst_v0 = reinterpret_cast<T *>(dst_tensor(coord))[1 - shift];
-
-                        // Combine channel U and V
-                        dst_u0 = src_u0;
-                        dst_v0 = src_v0;
-                    }
-
-                    break;
-                }
-                case Format::IYUV:
-                case Format::YUV444:
-                {
-                    // Get Y/U/V element
-                    const T &src = reinterpret_cast<const T *>(image_planes[plane_idx](coord))[0];
-                    T       &dst = reinterpret_cast<T *>(dst_tensor(coord))[0];
-
-                    // Copy Y/U/V plane
-                    dst = src;
-
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-        }
-    }
-
-    return dst;
-}
-
-template std::vector<SimpleTensor<uint8_t>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<uint8_t>> &image_planes, Format format);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/ChannelCombine.h b/tests/validation/reference/ChannelCombine.h
deleted file mode 100644
index 315e2d4dd4..0000000000
--- a/tests/validation/reference/ChannelCombine.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CHANNEL_COMBINE_H
-#define ARM_COMPUTE_TEST_CHANNEL_COMBINE_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-std::vector<SimpleTensor<T>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<T>> &image_planes, Format format);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CHANNEL_COMBINE_H */
diff --git a/tests/validation/reference/ChannelExtract.cpp b/tests/validation/reference/ChannelExtract.cpp
deleted file mode 100644
index 8674510269..0000000000
--- a/tests/validation/reference/ChannelExtract.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ChannelExtract.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint8_t> channel_extract(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, Format format, Channel channel)
-{
-    // Find plane and channel index
-    const unsigned int plane_idx   = plane_idx_from_channel(format, channel);
-    const unsigned int channel_idx = channel_idx_from_format(format, channel);
-
-    // Create dst and get src tensor
-    SimpleTensor<T> src = tensor_planes[plane_idx];
-    SimpleTensor<T> dst{ calculate_subsampled_shape(shape, format, channel), Format::U8 };
-
-    // Single planar formats with subsampling require a double horizontal step
-    const int step_x = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1;
-    const int width  = dst.shape().x();
-    const int height = dst.shape().y();
-
-    // Loop over each pixel and extract channel
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(2)
-#endif /* _OPENMP */
-    for(int y = 0; y < height; ++y)
-    {
-        for(int x = 0; x < width; ++x)
-        {
-            const Coordinates src_coord{ x * step_x, y };
-            const Coordinates dst_coord{ x, y };
-
-            const auto *src_pixel = reinterpret_cast<const T *>(src(src_coord));
-            auto       *dst_pixel = reinterpret_cast<T *>(dst(dst_coord));
-
-            dst_pixel[0] = src_pixel[channel_idx]; // NOLINT
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> channel_extract(const TensorShape &shape, const std::vector<SimpleTensor<uint8_t>> &tensor_planes, Format format, Channel channel);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/ChannelExtract.h b/tests/validation/reference/ChannelExtract.h
deleted file mode 100644
index ce1e6732bd..0000000000
--- a/tests/validation/reference/ChannelExtract.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_CHANNEL_EXTRACT_H
-#define ARM_COMPUTE_TEST_CHANNEL_EXTRACT_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint8_t> channel_extract(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, Format format, Channel channel);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_CHANNEL_EXTRACT_H */
diff --git a/tests/validation/reference/ColorConvert.cpp b/tests/validation/reference/ColorConvert.cpp
deleted file mode 100644
index c6a4630999..0000000000
--- a/tests/validation/reference/ColorConvert.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ColorConvert.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/ColorConvertHelper.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-template <typename T>
-inline std::vector<SimpleTensor<T>> create_image_planes(const TensorShape &shape, Format format)
-{
-    TensorShape image_shape = adjust_odd_shape(shape, format);
-
-    std::vector<SimpleTensor<T>> image_planes;
-
-    switch(format)
-    {
-        case Format::U8:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-        {
-            image_planes.emplace_back(image_shape, format);
-            break;
-        }
-        case Format::NV12:
-        case Format::NV21:
-        {
-            TensorShape shape_uv88 = calculate_subsampled_shape(image_shape, Format::UV88);
-
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(shape_uv88, Format::UV88);
-            break;
-        }
-        case Format::IYUV:
-        {
-            TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, Format::IYUV);
-
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(shape_sub2, Format::U8);
-            image_planes.emplace_back(shape_sub2, Format::U8);
-            break;
-        }
-        case Format::YUV444:
-        {
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(image_shape, Format::U8);
-            image_planes.emplace_back(image_shape, Format::U8);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    return image_planes;
-}
-} // namespace
-
-template <typename T>
-std::vector<SimpleTensor<T>> color_convert(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, Format src_format, Format dst_format)
-{
-    std::vector<SimpleTensor<T>> dst = create_image_planes<T>(shape, dst_format);
-
-    switch(src_format)
-    {
-        case Format::RGB888:
-        {
-            switch(dst_format)
-            {
-                case Format::RGBA8888:
-                    colorconvert_helper::detail::colorconvert_rgb_to_rgbx(tensor_planes[0], dst[0]);
-                    break;
-                case Format::U8:
-                    colorconvert_helper::detail::colorconvert_rgb_to_u8(tensor_planes[0], dst[0]);
-                    break;
-                case Format::NV12:
-                    colorconvert_helper::detail::colorconvert_rgb_to_nv12(tensor_planes[0], dst);
-                    break;
-                case Format::IYUV:
-                    colorconvert_helper::detail::colorconvert_rgb_to_iyuv(tensor_planes[0], dst);
-                    break;
-                case Format::YUV444:
-                    colorconvert_helper::detail::colorconvert_rgb_to_yuv4(tensor_planes[0], dst);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not Supported");
-                    break;
-            }
-            break;
-        }
-        case Format::RGBA8888:
-        {
-            switch(dst_format)
-            {
-                case Format::RGB888:
-                    colorconvert_helper::detail::colorconvert_rgbx_to_rgb(tensor_planes[0], dst[0]);
-                    break;
-                case Format::NV12:
-                    colorconvert_helper::detail::colorconvert_rgb_to_nv12(tensor_planes[0], dst);
-                    break;
-                case Format::IYUV:
-                    colorconvert_helper::detail::colorconvert_rgb_to_iyuv(tensor_planes[0], dst);
-                    break;
-                case Format::YUV444:
-                    colorconvert_helper::detail::colorconvert_rgb_to_yuv4(tensor_planes[0], dst);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not Supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        case Format::YUYV422:
-        {
-            switch(dst_format)
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    colorconvert_helper::detail::colorconvert_yuyv_to_rgb(tensor_planes[0], src_format, dst[0]);
-                    break;
-                case Format::NV12:
-                    colorconvert_helper::detail::colorconvert_yuyv_to_nv12(tensor_planes[0], src_format, dst);
-                    break;
-                case Format::IYUV:
-                    colorconvert_helper::detail::colorconvert_yuyv_to_iyuv(tensor_planes[0], src_format, dst);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not Supported");
-                    break;
-            }
-            break;
-        }
-        case Format::IYUV:
-        {
-            switch(dst_format)
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    colorconvert_helper::detail::colorconvert_iyuv_to_rgb(tensor_planes, dst[0]);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not Supported");
-                    break;
-            }
-            break;
-        }
-        case Format::NV12:
-        case Format::NV21:
-        {
-            switch(dst_format)
-            {
-                case Format::RGB888:
-                case Format::RGBA8888:
-                    colorconvert_helper::detail::colorconvert_nv12_to_rgb(src_format, tensor_planes, dst[0]);
-                    break;
-                case Format::IYUV:
-                    colorconvert_helper::detail::colorconvert_nv_to_iyuv(tensor_planes, src_format, dst);
-                    break;
-                case Format::YUV444:
-                    colorconvert_helper::detail::colorconvert_nv_to_yuv4(tensor_planes, src_format, dst);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not Supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-    return dst;
-}
-
-template std::vector<SimpleTensor<uint8_t>> color_convert(const TensorShape &shape, const std::vector<SimpleTensor<uint8_t>> &tensor_planes, Format src_format, Format dst_format);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/ColorConvert.h b/tests/validation/reference/ColorConvert.h
deleted file mode 100644
index 28776cb85f..0000000000
--- a/tests/validation/reference/ColorConvert.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_COLOR_CONVERT_H
-#define ARM_COMPUTE_TEST_COLOR_CONVERT_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-std::vector<SimpleTensor<T>> color_convert(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, Format src_format, Format dst_format);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_COLOR_CONVERT_H */
diff --git a/tests/validation/reference/Conv3D.cpp b/tests/validation/reference/Conv3D.cpp
new file mode 100644
index 0000000000..e4010a507a
--- /dev/null
+++ b/tests/validation/reference/Conv3D.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Conv3D.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "support/AclRequires.h"
+#include "tests/validation/reference/UtilsQuantizedAsymm.h"
+
+// Source/Destination tensor shape indices (N D H W C): channels are dimension 0, batches are dimension 4
+constexpr unsigned int batch_dim   = 4u;
+constexpr unsigned int depth_dim   = 3u;
+constexpr unsigned int height_dim  = 2u;
+constexpr unsigned int width_dim   = 1u;
+constexpr unsigned int channel_dim = 0u;
+
+// Weight tensor shape indices (D H W Cin Cout): output channels are dimension 0, kernel depth is dimension 4
+constexpr unsigned int weights_depth_dim  = 4u;
+constexpr unsigned int weights_height_dim = 3u;
+constexpr unsigned int weights_width_dim  = 2u;
+constexpr unsigned int weights_CHin_dim   = 1u;
+constexpr unsigned int weights_CHout_dim  = 0u;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+inline bool is_valid_pixel(int i, int min, int max)
+{
+    return (i >= min && i < max); // half-open range test: min is included, max is excluded
+}
+
+// Floating-point overload: sums src * weights over one kernel window for output channel ch_out and adds that channel's bias.
+template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
+T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const Size3D &dilation, int batch,
+                   int z_start, int y_start, int x_start, int ch_out, UniformQuantizationInfo oq_info)
+{
+    ARM_COMPUTE_UNUSED(oq_info); // only the quantized overload reads the output quantization info
+
+    const unsigned int weights_width  = weights.shape()[weights_width_dim];
+    const unsigned int weights_height = weights.shape()[weights_height_dim];
+    const unsigned int weights_depth  = weights.shape()[weights_depth_dim];
+
+    const unsigned int src_channels = src.shape()[channel_dim];
+    const unsigned int src_width    = src.shape()[width_dim];
+    const unsigned int src_height   = src.shape()[height_dim];
+    const unsigned int src_depth    = src.shape()[depth_dim];
+
+    T total(0); // accumulator kept in the tensor element type T
+    for(unsigned int weight_d = 0; weight_d < weights_depth; ++weight_d)
+    {
+        const int idx_z = z_start + dilation.depth * weight_d;
+        for(unsigned int weight_y = 0; weight_y < weights_height; ++weight_y)
+        {
+            const int idx_y = y_start + dilation.height * weight_y;
+            for(unsigned int weight_x = 0; weight_x < weights_width; ++weight_x)
+            {
+                const int idx_x = x_start + dilation.width * weight_x;
+
+                // Skip taps whose source coordinate lies outside the source tensor (i.e. in the padding); they add nothing to the sum
+                const bool is_x_valid       = is_valid_pixel(idx_x, 0, src_width);
+                const bool is_y_valid       = is_valid_pixel(idx_y, 0, src_height);
+                const bool is_z_valid       = is_valid_pixel(idx_z, 0, src_depth);
+                const bool is_invalid_pixel = !(is_x_valid && is_y_valid && is_z_valid);
+                if(is_invalid_pixel)
+                {
+                    continue;
+                }
+
+                for(unsigned int ch_in = 0; ch_in < src_channels; ++ch_in)
+                {
+                    const T *in_ptr = src.data();
+                    const T *w_ptr  = weights.data();
+
+                    const int in_offset     = coord2index(src.shape(), Coordinates{ ch_in, idx_x, idx_y, idx_z, batch });
+                    const int weight_offset = coord2index(weights.shape(), Coordinates{ ch_out, ch_in, weight_x, weight_y, weight_d });
+                    T         input_value   = in_ptr[in_offset];
+                    T         weight_value  = w_ptr[weight_offset];
+                    total += (input_value * weight_value);
+                }
+            }
+        }
+    }
+
+    const TB *b_ptr      = bias.data(); // bias is indexed by output channel only
+    TB        bias_value = b_ptr[ch_out];
+
+    return total + bias_value;
+}
+// Quantized (uint8_t / int8_t) overload: accumulates offset-corrected products in int32_t, adds the bias and requantizes to T.
+template < typename T, typename TB, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const Size3D &dilation, int batch,
+                   int z_start, int y_start, int x_start, int ch_out, UniformQuantizationInfo oq_info)
+{
+    const unsigned int weights_width  = weights.shape()[weights_width_dim];
+    const unsigned int weights_height = weights.shape()[weights_height_dim];
+    const unsigned int weights_depth  = weights.shape()[weights_depth_dim];
+
+    const unsigned int src_channels = src.shape()[channel_dim];
+    const unsigned int src_width    = src.shape()[width_dim];
+    const unsigned int src_height   = src.shape()[height_dim];
+    const unsigned int src_depth    = src.shape()[depth_dim];
+
+    const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+    const int   input_offset   = -iq_info.offset; // negated so it can simply be added to each input value below
+    const float input_scale    = iq_info.scale;
+    int         weights_offset = -wq_info.offset; // negated for the same reason as input_offset
+    float       weights_scale  = wq_info.scale;
+    const int   output_offset  = oq_info.offset;
+    const float output_scale   = oq_info.scale;
+
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    const float multiplier        = input_scale * weights_scale / output_scale;
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); // fills the multiplier/shift pair passed to quantize_down_scale_by_fixedpoint below
+
+    int32_t total(0); // 32-bit accumulator for the offset-corrected products
+    for(unsigned int weight_d = 0; weight_d < weights_depth; ++weight_d)
+    {
+        const int idx_z = z_start + dilation.depth * weight_d;
+        for(unsigned int weight_y = 0; weight_y < weights_height; ++weight_y)
+        {
+            const int idx_y = y_start + dilation.height * weight_y;
+            for(unsigned int weight_x = 0; weight_x < weights_width; ++weight_x)
+            {
+                const int idx_x = x_start + dilation.width * weight_x;
+
+                // Skip taps whose source coordinate lies outside the source tensor (i.e. in the padding); they add nothing to the sum
+                const bool is_x_valid       = is_valid_pixel(idx_x, 0, src_width);
+                const bool is_y_valid       = is_valid_pixel(idx_y, 0, src_height);
+                const bool is_z_valid       = is_valid_pixel(idx_z, 0, src_depth);
+                const bool is_invalid_pixel = !(is_x_valid && is_y_valid && is_z_valid);
+                if(is_invalid_pixel)
+                {
+                    continue;
+                }
+
+                for(unsigned int ch_in = 0; ch_in < src_channels; ++ch_in)
+                {
+                    const T *in_ptr = src.data();
+                    const T *w_ptr  = weights.data();
+
+                    const int in_offset     = coord2index(src.shape(), Coordinates{ ch_in, idx_x, idx_y, idx_z, batch });
+                    const int weight_offset = coord2index(weights.shape(), Coordinates{ ch_out, ch_in, weight_x, weight_y, weight_d });
+                    T         input_value   = in_ptr[in_offset];
+                    T         weight_value  = w_ptr[weight_offset];
+                    total += ((input_value + input_offset) * (weight_value + weights_offset));
+                }
+            }
+        }
+    }
+
+    const TB *b_ptr      = bias.data(); // bias is indexed by output channel only
+    TB        bias_value = b_ptr[ch_out];
+
+    total += bias_value; // bias joins the raw accumulator before requantization
+
+    return validation::quantize_down_scale_by_fixedpoint(total, output_multiplier, output_shift, output_offset,
+                                                         std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max());
+}
+} // namespace
+// Reference 3D convolution: writes every element of the caller-provided dst via calculate_conv3d and returns dst by value.
+template <typename T, typename TB>
+SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const Conv3dInfo &conv3d_info)
+{
+    // Gather the tensor extents, padding and strides that drive the loops below
+    const unsigned int batch_size     = src.shape()[batch_dim];
+    const unsigned int dst_width      = dst.shape()[width_dim];
+    const unsigned int dst_height     = dst.shape()[height_dim];
+    const unsigned int dst_depth      = dst.shape()[depth_dim];
+    const unsigned int src_channels   = src.shape()[channel_dim];
+    const unsigned int weights_out_ch = weights.shape()[weights_CHout_dim];
+    const unsigned int dst_channels   = dst.shape()[channel_dim];
+    const size_t       pad_left       = conv3d_info.padding.left;
+    const size_t       pad_top        = conv3d_info.padding.top;
+    const size_t       pad_front      = conv3d_info.padding.front;
+    const size_t       stride_x       = conv3d_info.stride.x();
+    const size_t       stride_y       = conv3d_info.stride.y();
+    const size_t       stride_z       = conv3d_info.stride.z();
+
+    const TensorShape dst_shape = arm_compute::misc::shape_calculator::compute_conv3d_shape(src.shape(), weights.shape(), conv3d_info); // expected output shape, used only by the check below
+
+    ARM_COMPUTE_UNUSED(src_channels, weights_out_ch, dst_channels, dst_shape, weights_CHin_dim); // presumably avoids unused warnings when the ERROR_ON checks compile out - TODO confirm
+    // Number of batches of source and destination tensors must match.
+    ARM_COMPUTE_ERROR_ON(src.shape()[batch_dim] != dst.shape()[batch_dim]);
+    // Input channels in the source and weights must match.
+    ARM_COMPUTE_ERROR_ON(src_channels != weights.shape()[weights_CHin_dim]);
+    // Output channels of the weights and channels of the destination must match.
+    ARM_COMPUTE_ERROR_ON(weights_out_ch != dst_channels);
+    // Bias must match the number of destination channels.
+    ARM_COMPUTE_ERROR_ON(bias.shape()[0] != dst_channels);
+    // Compare given dst tensor shape with expected shape.
+    ARM_COMPUTE_ERROR_ON(dst.shape() != dst_shape);
+
+    for(unsigned int batch = 0; batch < batch_size; ++batch)
+    {
+        for(unsigned int z_out = 0; z_out < dst_depth; ++z_out)
+        {
+            const int z_start = (z_out * stride_z) - pad_front; // NOTE(review): evaluated in unsigned arithmetic, then narrowed to int; negative starts rely on that conversion
+            for(unsigned int y_out = 0; y_out < dst_height; ++y_out)
+            {
+                const int y_start = (y_out * stride_y) - pad_top;
+                for(unsigned int x_out = 0; x_out < dst_width; ++x_out)
+                {
+                    const int x_start = (x_out * stride_x) - pad_left;
+                    for(unsigned int ch_out = 0; ch_out < dst_channels; ++ch_out)
+                    {
+                        T *out_ptr = dst.data();
+
+                        const int out_offset = coord2index(dst.shape(), Coordinates{ ch_out, x_out, y_out, z_out, batch });
+                        out_ptr[out_offset]  = calculate_conv3d<T, TB>(src, weights, bias, conv3d_info.dilation, batch, z_start, y_start, x_start, ch_out, dst.quantization_info().uniform());
+                    }
+                }
+            }
+        }
+    }
+    return dst;
+}
+// Explicit instantiations: float and half use a bias of the same type; uint8_t and int8_t use an int32_t bias.
+template SimpleTensor<float> conv3d(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, SimpleTensor<float> &dst,
+                                    const Conv3dInfo &conv3d_info);
+template SimpleTensor<half> conv3d(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, SimpleTensor<half> &dst,
+                                   const Conv3dInfo &conv3d_info);
+template SimpleTensor<uint8_t> conv3d(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &dst,
+                                      const Conv3dInfo &conv3d_info);
+template SimpleTensor<int8_t> conv3d(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<int8_t> &dst,
+                                     const Conv3dInfo &conv3d_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Box3x3.h b/tests/validation/reference/Conv3D.h
index f377f280af..e3674f4bfb 100644
--- a/tests/validation/reference/Box3x3.h
+++ b/tests/validation/reference/Conv3D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_BOX3X3_H
-#define ARM_COMPUTE_TEST_BOX3X3_H
+#ifndef ARM_COMPUTE_TEST_CONV3D_LAYER_H
+#define ARM_COMPUTE_TEST_CONV3D_LAYER_H
 
+#include "Utils.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
 #include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -34,10 +37,11 @@ namespace validation
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> box3x3(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value);
+template <typename T, typename TB>
+SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst,
+                       const Conv3dInfo &conv3d_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_BOX3X3_H */
+#endif /* ARM_COMPUTE_TEST_CONV3D_LAYER_H */
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 1666e3857b..b67e88e839 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/Requires.h"
+#include "support/AclRequires.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
diff --git a/tests/validation/reference/DFT.cpp b/tests/validation/reference/DFT.cpp
index fd126c7d73..2b03c270ac 100644
--- a/tests/validation/reference/DFT.cpp
+++ b/tests/validation/reference/DFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -400,10 +400,10 @@ SimpleTensor<T> conv2d_dft(const SimpleTensor<T> &src, const SimpleTensor<T> &w,
     auto              padded_src = pad_layer(src, padding_in);
 
     // Flip weights
-    std::vector<uint32_t>  axis_v = { 0, 1 };
-    SimpleTensor<uint32_t> axis{ TensorShape(2U), DataType::U32 };
+    std::vector<uint32_t> axis_v = { 0, 1 };
+    SimpleTensor<int32_t> axis{ TensorShape(2U), DataType::S32 };
     std::copy(axis_v.begin(), axis_v.begin() + axis.shape().x(), axis.data());
-    auto flipped_w = reverse(w, axis);
+    auto flipped_w = reverse(w, axis, /* use_inverted_axis */ false);
 
     // Pad weights to have the same size as input
     const PaddingList paddings_w = { { 0, src.shape()[0] - 1 }, { 0, src.shape()[1] - 1 } };
diff --git a/tests/validation/reference/DeconvolutionLayer.cpp b/tests/validation/reference/DeconvolutionLayer.cpp
index 891828533f..eeb25fcbe3 100644
--- a/tests/validation/reference/DeconvolutionLayer.cpp
+++ b/tests/validation/reference/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@ namespace validation
 {
 namespace reference
 {
-template <typename T, typename TB>
-SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape,
                                     const PadStrideInfo &info, QuantizationInfo out_qinfo)
 {
     // Create reference
@@ -99,7 +99,7 @@ SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTens
     }
 
     // Flip weights by 180 degrees
-    SimpleTensor<T> weights_flipped{ weights.shape(), weights.data_type(), 1, weights.quantization_info() };
+    SimpleTensor<TW> weights_flipped{ weights.shape(), weights.data_type(), 1, weights.quantization_info(), weights.data_layout() };
 #if defined(_OPENMP)
     #pragma omp parallel for
 #endif /* _OPENMP */
@@ -143,6 +143,8 @@ SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTens
 
 template SimpleTensor<uint8_t> deconvolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
                                                    const PadStrideInfo &info, QuantizationInfo out_quant_info);
+template SimpleTensor<uint8_t> deconvolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
+                                                   const PadStrideInfo &info, QuantizationInfo out_quant_info);
 template SimpleTensor<int8_t> deconvolution_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
                                                   const PadStrideInfo &info, QuantizationInfo out_quant_info);
 template SimpleTensor<float> deconvolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
diff --git a/tests/validation/reference/DeconvolutionLayer.h b/tests/validation/reference/DeconvolutionLayer.h
index 07b9a531a7..16f0d9ae59 100644
--- a/tests/validation/reference/DeconvolutionLayer.h
+++ b/tests/validation/reference/DeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,16 +39,16 @@ namespace reference
  *
  * src              Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
  *                  Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * weights          The 4d weights with dimensions [width, height, OFM, IFM]. Data type supported: Same as @p input.
+ * weights          The 4d weights with dimensions [width, height, OFM, IFM]. Data type supported: Same as @p input, or QSYMM8_PER_CHANNEL if the input is QASYMM8/QASYMM8_SIGNED.
  * bias             Optional, ignored if NULL. The biases have one dimension.
- *                  Data type supported: Same as @p input, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ *                  Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED types where biases should be of S32 type
  * output_shape     Output tensor shape. The output has the same number of dimensions as the @p input.
  * info             Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
  * a                The number of zeros added to right and top edges of the input.
  *
  */
-template <typename T, typename TB>
-SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
                                     QuantizationInfo out_qinfo = QuantizationInfo());
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp
index 94c719ade7..3f88897f8e 100644
--- a/tests/validation/reference/DepthConvertLayer.cpp
+++ b/tests/validation/reference/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -165,7 +165,7 @@ template SimpleTensor<half> depth_convert(const SimpleTensor<int32_t> &src, Data
 template SimpleTensor<float> depth_convert(const SimpleTensor<int32_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
 // BFLOAT16
-template SimpleTensor<float> depth_convert(const SimpleTensor<bfloat16> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<bfloat16> depth_convert(const SimpleTensor<bfloat16> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
 // F16
 template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<half> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
@@ -186,6 +186,25 @@ template SimpleTensor<int32_t> depth_convert(const SimpleTensor<float> &src, Dat
 template SimpleTensor<half> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 template SimpleTensor<bfloat16> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
+// S64
+template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int8_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int16_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+
+// U64
+template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int16_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp
index 64a89aa6a0..67d69c2c38 100644
--- a/tests/validation/reference/DequantizationLayer.cpp
+++ b/tests/validation/reference/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,6 +59,12 @@ TOut dequantize(int16_t val, const UniformQuantizationInfo qinfo, DataType dt)
     ARM_COMPUTE_UNUSED(dt);
     return static_cast<TOut>(dequantize_qsymm16(val, qinfo));
 }
+template <typename TOut>
+TOut dequantize(int32_t val, const UniformQuantizationInfo qinfo, DataType dt)
+{
+    ARM_COMPUTE_UNUSED(dt); // dt is not read by this overload; the parameter mirrors the int16_t overload's signature
+    return static_cast<TOut>(dequantize_s32(val, qinfo)); // delegate to dequantize_s32() and cast its result to the requested output type
+}
 } // namespace
 template <typename TOut, typename TIn>
 SimpleTensor<TOut> dequantization_layer(const SimpleTensor<TIn> &src)
@@ -115,6 +121,7 @@ template SimpleTensor<half> dequantization_layer(const SimpleTensor<int8_t> &src
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<int8_t> &src);
 template SimpleTensor<half> dequantization_layer(const SimpleTensor<int16_t> &src);
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<int16_t> &src);
+template SimpleTensor<float> dequantization_layer(const SimpleTensor<int32_t> &src);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Derivative.cpp b/tests/validation/reference/Derivative.cpp
deleted file mode 100644
index c65ebcada5..0000000000
--- a/tests/validation/reference/Derivative.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Derivative.h"
-
-#include "Utils.h"
-#include "tests/Types.h"
-
-#include <array>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-const std::array<int8_t, 9> derivative_3_x{ { 0, 0, 0, -1, 0, 1, 0, 0, 0 } };
-const std::array<int8_t, 9> derivative_3_y{ { 0, -1, 0, 0, 0, 0, 0, 1, 0 } };
-
-template <typename T>
-struct data_type;
-
-template <>
-struct data_type<int16_t>
-{
-    const static DataType value = DataType::S16;
-};
-} // namespace
-
-template <typename T, typename U>
-std::pair<SimpleTensor<T>, SimpleTensor<T>> derivative(const SimpleTensor<U> &src, BorderMode border_mode, uint8_t constant_border_value, GradientDimension gradient_dimension)
-{
-    const unsigned int filter_size = 3;
-
-    SimpleTensor<T> dst_x(src.shape(), data_type<T>::value, src.num_channels());
-    SimpleTensor<T> dst_y(src.shape(), data_type<T>::value, src.num_channels());
-
-    ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(filter_size / 2));
-
-    const uint32_t num_elements = src.num_elements();
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t i = 0; i < num_elements; ++i)
-    {
-        Coordinates coord = index2coord(src.shape(), i);
-
-        if(!is_in_valid_region(valid_region, coord))
-        {
-            continue;
-        }
-
-        switch(gradient_dimension)
-        {
-            case GradientDimension::GRAD_X:
-                apply_2d_spatial_filter(coord, src, dst_x, TensorShape{ filter_size, filter_size }, derivative_3_x.data(), 1.f, border_mode,
-                                        constant_border_value);
-                break;
-            case GradientDimension::GRAD_Y:
-                apply_2d_spatial_filter(coord, src, dst_y, TensorShape{ filter_size, filter_size }, derivative_3_y.data(), 1.f, border_mode,
-                                        constant_border_value);
-                break;
-            case GradientDimension::GRAD_XY:
-                apply_2d_spatial_filter(coord, src, dst_x, TensorShape{ filter_size, filter_size }, derivative_3_x.data(), 1.f, border_mode,
-                                        constant_border_value);
-                apply_2d_spatial_filter(coord, src, dst_y, TensorShape{ filter_size, filter_size }, derivative_3_y.data(), 1.f, border_mode,
-                                        constant_border_value);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Gradient dimension not supported");
-        }
-    }
-
-    return std::make_pair(dst_x, dst_y);
-}
-
-template std::pair<SimpleTensor<int16_t>, SimpleTensor<int16_t>> derivative(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value,
-                                                                            GradientDimension gradient_dimension);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Derivative.h b/tests/validation/reference/Derivative.h
deleted file mode 100644
index 16f764e90e..0000000000
--- a/tests/validation/reference/Derivative.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_DERIVATIVE_H
-#define ARM_COMPUTE_TEST_DERIVATIVE_H
-
-#include "tests/SimpleTensor.h"
-#include "tests/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-std::pair<SimpleTensor<T>, SimpleTensor<T>> derivative(const SimpleTensor<U> &src, BorderMode border_mode, uint8_t constant_border_value, GradientDimension gradient_dimension);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DERIVATIVE_H */
diff --git a/tests/validation/reference/Dilate.cpp b/tests/validation/reference/Dilate.cpp
deleted file mode 100644
index be8ccb6f3a..0000000000
--- a/tests/validation/reference/Dilate.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Dilate.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-
-#include <algorithm>
-#include <array>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> dilate(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value)
-{
-    /*
-             -1   x  +1
-         -1 [tl][tc][tr] -1
-          y [ml][xy][mr]  y
-         +1 [bl][bc][br] +1
-             -1   x  +1
-        dilate:
-        dst(x, y) = max[ src(x', y') for x-1<=x'<=x+1, y-1<=y'<=y+1 ] = max({tl, tc, tr, ml, xy, mr, bl, bc, br})
-    */
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-    const uint32_t  num_elements = src.num_elements();
-
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t i = 0; i < num_elements; ++i)
-    {
-        Coordinates coord = index2coord(src.shape(), i);
-        const int   x     = coord.x();
-        const int   y     = coord.y();
-
-        std::array<T, 9> neighbours = { { 0 } };
-        for(int row = y - 1, j = 0; row <= y + 1; ++row)
-        {
-            for(int col = x - 1; col <= x + 1; ++col, ++j)
-            {
-                coord.set(0, col);
-                coord.set(1, row);
-                neighbours[j] = tensor_elem_at(src, coord, border_mode, constant_border_value);
-            }
-        }
-
-        dst[i] = *std::max_element(neighbours.cbegin(), neighbours.cend());
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> dilate(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/ElementWiseUnary.cpp b/tests/validation/reference/ElementWiseUnary.cpp
deleted file mode 100644
index 1d46ed648f..0000000000
--- a/tests/validation/reference/ElementWiseUnary.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ElementWiseUnary.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, ElementWiseUnary op)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
-    for(int i = 0; i < src.num_elements(); ++i)
-    {
-        switch(op)
-        {
-            case ElementWiseUnary::RSQRT:
-                dst[i] = 1.f / std::sqrt(src[i]);
-                break;
-            case ElementWiseUnary::EXP:
-                dst[i] = std::exp(src[i]);
-                break;
-            case ElementWiseUnary::NEG:
-                dst[i] = -src[i];
-                break;
-            case ElementWiseUnary::LOG:
-                dst[i] = std::log(src[i]);
-                break;
-            case ElementWiseUnary::ABS:
-                dst[i] = std::abs(src[i]);
-                break;
-            case ElementWiseUnary::SIN:
-                dst[i] = std::sin(src[i]);
-                break;
-            case ElementWiseUnary::ROUND:
-                dst[i] = arm_compute::support::cpp11::nearbyint(src[i]);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<float> elementwise_unary(const SimpleTensor<float> &src, ElementWiseUnary op);
-template SimpleTensor<half> elementwise_unary(const SimpleTensor<half> &src, ElementWiseUnary op);
-template SimpleTensor<int32_t> elementwise_unary(const SimpleTensor<int32_t> &src, ElementWiseUnary op);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp
index f22c84e153..edbbab8600 100644
--- a/tests/validation/reference/ElementwiseOperations.cpp
+++ b/tests/validation/reference/ElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,15 +74,6 @@ T arithm_op(ArithmeticOperation op, T src1, T src2, ConvertPolicy convert_policy
         case ArithmeticOperation::DIV:
         {
             val = (static_cast<intermediate_type>(src1) / static_cast<intermediate_type>(src2));
-            if(std::is_integral<T>::value)
-            {
-                // Implement flooring division
-                val = (src2 == 0) ? 0 : val;
-                if(static_cast<int32_t>(src1) % static_cast<int32_t>(src2) != 0 && ((src1 < 0) != (src2 < 0)))
-                {
-                    --val;
-                }
-            }
             break;
         }
         case ArithmeticOperation::POWER:
diff --git a/tests/validation/reference/ElementwiseUnary.cpp b/tests/validation/reference/ElementwiseUnary.cpp
new file mode 100644
index 0000000000..558f9d24fc
--- /dev/null
+++ b/tests/validation/reference/ElementwiseUnary.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ElementwiseUnary.h"
+#include "tests/validation/Helpers.h"
+#include "utils/TypePrinter.h"
+
+#include <cmath>
+#include <limits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Reference implementation of the element-wise unary operators.
+ *
+ * Applies @p op to every element of @p src and stores the result at the same linear index of @p dst.
+ *
+ * @param[in]  src Input tensor.
+ * @param[out] dst Output tensor, written at the linear indices of @p src.
+ * @param[in]  op  Operation to apply; an operation without a case below raises an error.
+ *
+ * @return A copy of @p dst.
+ */
+template <typename T>
+SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, SimpleTensor<T> &dst, ElementWiseUnary op)
+{
+    for(int i = 0; i < src.num_elements(); ++i)
+    {
+        switch(op)
+        {
+            case ElementWiseUnary::RSQRT:
+                dst[i] = 1.f / std::sqrt(src[i]);
+                break;
+            case ElementWiseUnary::EXP:
+                dst[i] = std::exp(src[i]);
+                break;
+            case ElementWiseUnary::NEG:
+                dst[i] = -src[i];
+                break;
+            case ElementWiseUnary::LOG:
+                dst[i] = std::log(src[i]);
+                break;
+            case ElementWiseUnary::ABS:
+                dst[i] = std::abs(src[i]);
+                break;
+            case ElementWiseUnary::SIN:
+                dst[i] = std::sin(src[i]);
+                break;
+            case ElementWiseUnary::ROUND:
+                dst[i] = arm_compute::support::cpp11::nearbyint(src[i]);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not implemented");
+        }
+    }
+    return dst;
+}
+
+namespace
+{
+/** Shared implementation of the 8-bit asymmetric quantized specializations.
+ *
+ * @p src is converted to F32 with convert_from_asymmetric(), @p op is evaluated in F32 and the result is
+ * converted back with convert_to_asymmetric() using the quantization info of @p dst, which is then overwritten.
+ *
+ * RSQRT and LOG special-case an F32 input of exactly 0: the output is set to the real value that corresponds
+ * to the largest (RSQRT, since 1/sqrt(0) is 'inf') or smallest (LOG) quantized value representable by @p T.
+ *
+ * @param[in]      src Quantized input tensor.
+ * @param[in, out] dst Quantized output tensor; its quantization info is used for the conversion back.
+ * @param[in]      op  Operation to apply.
+ */
+template <typename T>
+void elementwise_unary_qasymm8(const SimpleTensor<T> &src, SimpleTensor<T> &dst, ElementWiseUnary op)
+{
+    SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float> dst_tmp(src.shape(), DataType::F32);
+
+    if(op == ElementWiseUnary::RSQRT || op == ElementWiseUnary::LOG)
+    {
+        const auto  qinfo   = dst.quantization_info().uniform();
+        const float max_val = (static_cast<float>(std::numeric_limits<T>::max()) - qinfo.offset) * qinfo.scale;
+        const float min_val = (static_cast<float>(std::numeric_limits<T>::lowest()) - qinfo.offset) * qinfo.scale;
+
+        for(int i = 0; i < src.num_elements(); ++i)
+        {
+            if(src_tmp[i] != 0)
+            {
+                dst_tmp[i] = (op == ElementWiseUnary::RSQRT) ? 1.f / std::sqrt(src_tmp[i]) : std::log(src_tmp[i]);
+            }
+            else
+            {
+                dst_tmp[i] = (op == ElementWiseUnary::RSQRT) ? max_val : min_val;
+            }
+        }
+    }
+    else
+    {
+        // The F32 reference already processes the whole tensor, so it is called exactly once here
+        // instead of once per element.
+        elementwise_unary(src_tmp, dst_tmp, op);
+    }
+
+    dst = convert_to_asymmetric<T>(dst_tmp, dst.quantization_info());
+}
+} // namespace
+
+/** Specialization for int8_t tensors: only QASYMM8_SIGNED is implemented, any other data type of @p dst raises an error. */
+template <>
+SimpleTensor<int8_t> elementwise_unary(const SimpleTensor<int8_t> &src, SimpleTensor<int8_t> &dst, ElementWiseUnary op)
+{
+    if(dst.data_type() == DataType::QASYMM8_SIGNED)
+    {
+        elementwise_unary_qasymm8(src, dst, op);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+    return dst;
+}
+
+/** Specialization for uint8_t tensors: only QASYMM8 is implemented, any other data type of @p dst raises an error. */
+template <>
+SimpleTensor<uint8_t> elementwise_unary(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, ElementWiseUnary op)
+{
+    if(dst.data_type() == DataType::QASYMM8)
+    {
+        elementwise_unary_qasymm8(src, dst, op);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+    return dst;
+}
+
+template SimpleTensor<float> elementwise_unary(const SimpleTensor<float> &src, SimpleTensor<float> &dst, ElementWiseUnary op);
+template SimpleTensor<half> elementwise_unary(const SimpleTensor<half> &src, SimpleTensor<half> &dst, ElementWiseUnary op);
+template SimpleTensor<int32_t> elementwise_unary(const SimpleTensor<int32_t> &src, SimpleTensor<int32_t> &dst, ElementWiseUnary op);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ElementWiseUnary.h b/tests/validation/reference/ElementwiseUnary.h
index be4a229a5b..ae7a49bce4 100644
--- a/tests/validation/reference/ElementWiseUnary.h
+++ b/tests/validation/reference/ElementwiseUnary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, ElementWiseUnary op);
+SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, SimpleTensor<T> &dst, ElementWiseUnary op);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/EqualizeHistogram.cpp b/tests/validation/reference/EqualizeHistogram.cpp
deleted file mode 100644
index 8a957d7085..0000000000
--- a/tests/validation/reference/EqualizeHistogram.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "EqualizeHistogram.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> equalize_histogram(const SimpleTensor<T> &src)
-{
-    const size_t num_bins = 256; // 0-255 inclusive
-
-    std::vector<T>        lut(num_bins);
-    std::vector<uint32_t> hist(num_bins);
-    std::vector<uint32_t> cd(num_bins); // cumulative distribution
-
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
-    // Create the histogram
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
-    {
-        hist[src[element_idx]]++;
-    }
-
-    // Calculate cumulative distribution
-    std::partial_sum(hist.begin(), hist.end(), cd.begin());
-
-    // Get the number of pixels that have the lowest non-zero value
-    const uint32_t cd_min = *std::find_if(hist.begin(), hist.end(), [](const uint32_t &x)
-    {
-        return x > 0;
-    });
-
-    const size_t total_num_pixels = cd.back();
-
-    // Single color - create linear distribution
-    if(total_num_pixels == cd_min)
-    {
-        std::iota(lut.begin(), lut.end(), 0);
-    }
-    else
-    {
-        const float diff = total_num_pixels - cd_min;
-
-        for(size_t i = 0; i < num_bins; ++i)
-        {
-            lut[i] = lround((cd[i] - cd_min) / diff * 255.f);
-        }
-    }
-
-    // Fill output tensor with equalized values
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(int i = 0; i < src.num_elements(); ++i)
-    {
-        dst[i] = lut[src[i]];
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> equalize_histogram(const SimpleTensor<uint8_t> &src);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/EqualizeHistogram.h b/tests/validation/reference/EqualizeHistogram.h
deleted file mode 100644
index c79b2131aa..0000000000
--- a/tests/validation/reference/EqualizeHistogram.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_EQUALIZE_HISTOGRAM_H
-#define ARM_COMPUTE_TEST_EQUALIZE_HISTOGRAM_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> equalize_histogram(const SimpleTensor<T> &src);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_EQUALIZE_HISTOGRAM_H */
diff --git a/tests/validation/reference/FastCorners.cpp b/tests/validation/reference/FastCorners.cpp
deleted file mode 100644
index 25fbf1b6f2..0000000000
--- a/tests/validation/reference/FastCorners.cpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "FastCorners.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/NonMaximaSuppression.h"
-
-#include "tests/framework/Asserts.h"
-#include <iomanip>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-constexpr unsigned int bresenham_radius = 3;
-constexpr unsigned int bresenham_count  = 16;
-
-/*
-    Offsets of the 16 pixels in the Bresenham circle of radius 3 centered on P
-        . . . . . . . . .
-        . . . F 0 1 . . .
-        . . E . . . 2 . .
-        . D . . . . . 3 .
-        . C . . P . . 4 .
-        . B . . . . . 5 .
-        . . A . . . 6 . .
-        . . . 9 8 7 . . .
-        . . . . . . . . .
-*/
-const std::array<std::array<int, 2>, 16> circle_offsets =
-{
-    {
-        { { 0, -3 } },  // 0 - pixel #1
-        { { 1, -3 } },  // 1 - pixel #2
-        { { 2, -2 } },  // 2 - pixel #3
-        { { 3, -1 } },  // 3 - pixel #4
-        { { 3, 0 } },   // 4 - pixel #5
-        { { 3, 1 } },   // 5 - pixel #6
-        { { 2, 2 } },   // 6 - pixel #7
-        { { 1, 3 } },   // 7 - pixel #8
-        { { 0, 3 } },   // 8 - pixel #9
-        { { -1, 3 } },  // 9 - pixel #10
-        { { -2, 2 } },  // A - pixel #11
-        { { -3, 1 } },  // B - pixel #12
-        { { -3, 0 } },  // C - pixel #13
-        { { -3, -1 } }, // D - pixel #14
-        { { -2, -2 } }, // E - pixel #15
-        { { -1, -3 } }  // F - pixel #16
-    }
-};
-
-/*
-    FAST-9 bit masks for consecutive points surrounding a corner candidate
-    Rejection of non-corners is expedited by checking pixels 1, 9, then 5, 13...
-*/
-const std::array<uint16_t, 16> fast9_masks =
-{
-    {
-        0x01FF, // 0000 0001 1111 1111
-        0x03FE, // 0000 0011 1111 1110
-        0x07FC, // 0000 0111 1111 1100
-        0x0FF8, // 0000 1111 1111 1000
-        0x1FF0, // 0001 1111 1111 0000
-        0x3FE0, // 0011 1111 1110 0000
-        0x7FC0, // 0111 1111 1100 0000
-        0xFF80, // 1111 1111 1000 0000
-        0xFF01, // 1111 1111 0000 0001
-        0xFE03, // 1111 1110 0000 0011
-        0xFC07, // 1111 1100 0000 0111
-        0xF80F, // 1111 1000 0000 1111
-        0xF01F, // 1111 0000 0001 1111
-        0xE03F, // 1110 0000 0011 1111
-        0xC07F, // 1100 0000 0111 1111
-        0x80FF  // 1000 0000 1111 1111
-    }
-};
-
-inline bool in_range(const uint8_t low, const uint8_t high, const uint8_t val)
-{
-    return low <= val && val <= high;
-}
-
-template <typename T, typename F>
-bool is_a_corner(const Coordinates &candidate, const SimpleTensor<T> &src, uint8_t threshold, BorderMode border_mode, T constant_border_value, F intensity_at)
-{
-    const auto intensity_p   = tensor_elem_at(src, candidate, border_mode, constant_border_value);
-    const auto thresh_bright = intensity_p + threshold;
-    const auto thresh_dark   = intensity_p - threshold;
-
-    // Quicker rejection of non-corner points by checking pixels 1, 9 then 5, 13 around the candidate
-    const auto p1  = intensity_at(candidate, 0);
-    const auto p9  = intensity_at(candidate, 8);
-    const auto p5  = intensity_at(candidate, 4);
-    const auto p13 = intensity_at(candidate, 12);
-
-    if((in_range(thresh_dark, thresh_bright, p1) && in_range(thresh_dark, thresh_bright, p9))
-       || (in_range(thresh_dark, thresh_bright, p5) && in_range(thresh_dark, thresh_bright, p13)))
-    {
-        return false;
-    }
-
-    uint16_t mask_bright = 0;
-    uint16_t mask_dark   = 0;
-
-    // Set bits of the brighter/darker pixels mask accordingly
-    for(unsigned int n = 0; n < bresenham_count; ++n)
-    {
-        T intensity_n = intensity_at(candidate, n);
-        mask_bright |= (intensity_n > thresh_bright) << n;
-        mask_dark |= (intensity_n < thresh_dark) << n;
-    }
-
-    // Mark as corner candidate if brighter/darker pixel sequence satisfies any one of the FAST-9 masks
-    const auto found = std::find_if(fast9_masks.begin(), fast9_masks.end(), [&](decltype(fast9_masks[0]) mask)
-    {
-        return (mask_bright & mask) == mask || (mask_dark & mask) == mask;
-    });
-
-    return found != fast9_masks.end();
-}
-} // namespace
-
-template <typename T>
-std::vector<KeyPoint> fast_corners(const SimpleTensor<T> &src, float input_thresh, bool suppress_nonmax, BorderMode border_mode, T constant_border_value)
-{
-    // Get intensity of pixel at given index on the Bresenham circle around a candidate point
-    const auto intensity_at = [&](const Coordinates & point, const unsigned int idx)
-    {
-        const auto  offset = circle_offsets[idx];
-        Coordinates px{ point.x() + offset[0], point.y() + offset[1] };
-        return tensor_elem_at(src, px, border_mode, constant_border_value);
-    };
-
-    const auto            threshold = static_cast<uint8_t>(input_thresh);
-    std::vector<KeyPoint> corners;
-
-    // 1. Detect potential corners (the segment test)
-    std::vector<Coordinates> corner_candidates;
-    SimpleTensor<uint8_t>    scores(src.shape(), DataType::U8);
-    ValidRegion              valid_region = shape_to_valid_region(src.shape(), BorderMode::UNDEFINED == border_mode, BorderSize(bresenham_radius));
-
-    const uint32_t num_elements = src.num_elements();
-    for(uint32_t i = 0; i < num_elements; ++i)
-    {
-        Coordinates candidate = index2coord(src.shape(), i);
-        scores[i]             = 0;
-        if(!is_in_valid_region(valid_region, candidate))
-        {
-            continue;
-        }
-
-        if(is_a_corner(candidate, src, threshold, border_mode, constant_border_value, intensity_at))
-        {
-            corner_candidates.emplace_back(candidate);
-            scores[i] = 1;
-        }
-    }
-
-    // 2. Calculate corner scores if necessary
-    if(suppress_nonmax)
-    {
-        for(const auto &candidate : corner_candidates)
-        {
-            const auto index      = coord2index(scores.shape(), candidate);
-            uint8_t    thresh_max = UINT8_MAX;
-            uint8_t    thresh_min = threshold;
-            uint8_t    response   = (thresh_min + thresh_max) / 2;
-
-            // Corner score (response) is the largest threshold for which the pixel remains a corner
-            while(thresh_max - thresh_min > 1)
-            {
-                response = (thresh_min + thresh_max) / 2;
-                if(is_a_corner(candidate, src, response, border_mode, constant_border_value, intensity_at))
-                {
-                    thresh_min = response; // raise threshold
-                }
-                else
-                {
-                    thresh_max = response; // lower threshold
-                }
-            }
-            scores[index] = thresh_min;
-        }
-
-        scores       = non_maxima_suppression(scores, border_mode, constant_border_value);
-        valid_region = shape_to_valid_region(scores.shape(), BorderMode::UNDEFINED == border_mode, BorderSize(bresenham_radius + 1));
-    }
-
-    for(const auto &candidate : corner_candidates)
-    {
-        const auto index = coord2index(scores.shape(), candidate);
-        if(scores[index] > 0.f && is_in_valid_region(valid_region, candidate))
-        {
-            KeyPoint corner;
-            corner.x               = candidate.x();
-            corner.y               = candidate.y();
-            corner.strength        = scores[index];
-            corner.tracking_status = 1;
-            corner.scale           = 0.f;
-            corner.orientation     = 0.f;
-            corner.error           = 0.f;
-            corners.emplace_back(corner);
-        }
-    }
-
-    return corners;
-}
-
-template std::vector<KeyPoint> fast_corners(const SimpleTensor<uint8_t> &src, float threshold, bool suppress_nonmax, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp
index 21333958f8..af30e9ee54 100644
--- a/tests/validation/reference/FullyConnectedLayer.cpp
+++ b/tests/validation/reference/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,7 +123,7 @@ SimpleTensor<T> fully_connected_layer(const SimpleTensor<T> &src, const SimpleTe
     // Create reference
     SimpleTensor<T> dst{ TensorShape{ dst_shape }, src.data_type(), 1, out_quant_info };
 
-    // Sanity checks
+    // Health checks
     const int          num_batch_dimensions = std::max(0, static_cast<int>(dst_shape.num_dimensions()) - 1);
     const int          num_input_dimensions = src.shape().num_dimensions() - num_batch_dimensions;
     const unsigned int linear_input_size    = src.shape().total_size_lower(num_input_dimensions);
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 6b3aa390f0..d513343796 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
 
 namespace arm_compute
 {
@@ -35,10 +36,11 @@ namespace validation
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
+SimpleTensor<T>
+gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
 {
     // Create reference
-    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1 };
+    SimpleTensor<T> dst{c.shape(), c.data_type(), 1};
 
     // Compute reference
     const int M = a.shape().y();
@@ -50,30 +52,47 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
     const int a_stride_z = K * M;
     const int a_stride_w = K * M * D;
 
-    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+    const int b_stride_z =
+        b.shape().num_dimensions() > 2
+        ? N * K
+        : 0; // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
+    int b_stride_w =
+        b.shape().num_dimensions() > 3
+        ? K * N * D
+        : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    // Note: There are 3 gemm types: batched-gemm, multi-gemm, and batched of multi-gemms. The third dimension of tensor b is overloaded when tensor b has exactly 3 dimensions:
+    // it can be either number of batches or multis. Batched-GEMM computation is detected only when the third dimension of "a" and "c" tensors is 1 and the number of dimensions is 4
+    const bool is_batched_gemm = b.shape().num_dimensions() == 3 && a.shape().num_dimensions() == 4 &&
+                                 c.shape().num_dimensions() == 4 && a.shape()[2] == 1 && c.shape()[2] == 1;
+
+    // Batched-GEMM
+    if (is_batched_gemm)
+    {
+        b_stride_w = b_stride_z;
+    }
 
     const int c_stride_z = N * M;
     const int c_stride_w = N * M * D;
 
-#if defined(_OPENMP) && !( defined(__arm__) && defined(__ANDROID__))
+#if defined(_OPENMP) && !(defined(__arm__) && defined(__ANDROID__))
     #pragma omp parallel for collapse(2)
 #endif /* _OPENMP */
-    for(int w = 0; w < W; ++w)
+    for (int w = 0; w < W; ++w)
     {
-        for(int depth = 0; depth < D; ++depth)
+        for (int depth = 0; depth < D; ++depth)
         {
             const int base_addr_a = depth * a_stride_z + w * a_stride_w;
             const int base_addr_b = depth * b_stride_z + w * b_stride_w;
             const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int row = 0; row < M; ++row)
+            for (int row = 0; row < M; ++row)
             {
-                for(int col = 0; col < N; ++col)
+                for (int col = 0; col < N; ++col)
                 {
                     T acc(0);
 
-                    for(int k = 0; k < K; ++k)
+                    for (int k = 0; k < K; ++k)
                     {
                         acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N];
                     }
@@ -89,11 +108,12 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
 }
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
+SimpleTensor<T> gemm_mixed_precision(
+    const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
 {
     // GEMM mixed-precision combines F32 accumulators with F16 multiplications
     // Create reference
-    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1 };
+    SimpleTensor<T> dst{c.shape(), c.data_type(), 1};
 
     // Compute reference
     const int M = a.shape().y();
@@ -105,36 +125,54 @@ SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTenso
     const int a_stride_z = K * M;
     const int a_stride_w = K * M * D;
 
-    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+    const int b_stride_z =
+        b.shape().num_dimensions() > 2
+        ? N * K
+        : 0; // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
+    int b_stride_w =
+        b.shape().num_dimensions() > 3
+        ? K * N * D
+        : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    // Note: There are 3 gemm types: batched-gemm, multi-gemm, and batched of multi-gemms. The third dimension of tensor b is overloaded when tensor b has exactly 3 dimensions:
+    // it can be either number of batches or multis. Batched-GEMM computation is detected only when the third dimension of "a" and "c" tensors is 1 and the number of dimensions is 4
+    const bool is_batched_gemm = b.shape().num_dimensions() == 3 && a.shape().num_dimensions() == 4 &&
+                                 c.shape().num_dimensions() == 4 && a.shape()[2] == 1 && c.shape()[2] == 1;
+
+    // Batched-GEMM
+    if (is_batched_gemm)
+    {
+        b_stride_w = b_stride_z;
+    }
 
     const int c_stride_z = N * M;
     const int c_stride_w = N * M * D;
 
-#if defined(_OPENMP) && !( defined(__arm__) && defined(__ANDROID__))
+#if defined(_OPENMP) && !(defined(__arm__) && defined(__ANDROID__))
     #pragma omp parallel for collapse(2)
 #endif /* _OPENMP */
-    for(int w = 0; w < W; ++w)
+    for (int w = 0; w < W; ++w)
     {
-        for(int depth = 0; depth < D; ++depth)
+        for (int depth = 0; depth < D; ++depth)
         {
             const int base_addr_a = depth * a_stride_z + w * a_stride_w;
             const int base_addr_b = depth * b_stride_z + w * b_stride_w;
             const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int row = 0; row < M; ++row)
+            for (int row = 0; row < M; ++row)
             {
-                for(int col = 0; col < N; ++col)
+                for (int col = 0; col < N; ++col)
                 {
                     float acc(0);
 
-                    for(int k = 0; k < K; ++k)
+                    for (int k = 0; k < K; ++k)
                     {
                         acc += static_cast<float>(a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N]);
                     }
 
                     // Finalize the result: alpha * A * B + beta * C
-                    dst[base_addr_c + col + row * N] = static_cast<T>(alpha * acc + beta * c[base_addr_c + col + row * N]);
+                    dst[base_addr_c + col + row * N] =
+                        static_cast<T>(alpha * acc + beta * c[base_addr_c + col + row * N]);
                 }
             }
         }
@@ -143,8 +181,21 @@ SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTenso
     return dst;
 }
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+void gemm_accumulate(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta, SimpleTensor<T> &dst)
+{
+    // Compute the GEMM reference into a temporary, then ADD it (SATURATE policy) to dst; dst is passed to arithmetic_operation both as an operand and as the result tensor
+    SimpleTensor<T> dst_gemm = gemm(a, b, c, alpha, beta);
+    reference::arithmetic_operation<T>(reference::ArithmeticOperation::ADD, dst, dst_gemm, dst, ConvertPolicy::SATURATE);
+}
+
+template SimpleTensor<bfloat16> gemm(const SimpleTensor<bfloat16> &a, const SimpleTensor<bfloat16> &b, const SimpleTensor<bfloat16> &c, float alpha, float beta);
 template SimpleTensor<float> gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta);
 template SimpleTensor<half> gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
+
+template void gemm_accumulate(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta, SimpleTensor<float> &dst);
+template void gemm_accumulate(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta, SimpleTensor<half> &dst);
+
 template SimpleTensor<half> gemm_mixed_precision(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/GEMM.h b/tests/validation/reference/GEMM.h
index 5feaeda584..1b97570122 100644
--- a/tests/validation/reference/GEMM.h
+++ b/tests/validation/reference/GEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMM_H
-#define ARM_COMPUTE_TEST_GEMM_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_GEMM_H
+#define ACL_TESTS_VALIDATION_REFERENCE_GEMM_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -41,8 +41,11 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta);
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+void gemm_accumulate(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta, SimpleTensor<T> &dst);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMM_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_GEMM_H
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 1615b51e73..30c577d850 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "GEMMLowp.h"
 
 #include "arm_compute/core/Types.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
 #include "support/ToolchainSupport.h"
@@ -230,6 +231,13 @@ SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, c
     return c;
 }
 
+template <typename T_out, typename T_in, typename T_in_1>
+void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<T_out> &dst)
+{
+    SimpleTensor<T_out> dst_gemm = gemmlowp_matrix_multiply_core<T_out, T_in, T_in_1>(a, b, shape_c, a_offset, b_offset); // result of the non-accumulating reference, held in a temporary
+    reference::arithmetic_operation<T_out>(reference::ArithmeticOperation::ADD, dst, dst_gemm, dst, ConvertPolicy::SATURATE); // ADD the temporary to dst (SATURATE policy); dst is both an operand and the result tensor
+}
+
 // used to validate assembly kernels which don't know anything about offsets
 template <typename T1, typename T2, typename T3>
 SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c)
@@ -336,6 +344,8 @@ template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<in
                                                            std::vector<int32_t> result_shift, int32_t min, int32_t max);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+template void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<int32_t> &dst);
+template void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<int32_t> &dst);
 template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
 template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
 template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index 99015d71fb..6e471fdad1 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMMLOWP_H
-#define ARM_COMPUTE_TEST_GEMMLOWP_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H
+#define ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -38,6 +38,9 @@ namespace reference
 template <typename T1, typename T2, typename T3>
 SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 
+template <typename T1, typename T2, typename T3>
+void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<T1> &dst_);
+
 template <typename T1, typename T2, typename T3 = T2>
 SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c);
 
@@ -71,4 +74,4 @@ SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn>
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H
diff --git a/tests/validation/reference/Gather.cpp b/tests/validation/reference/Gather.cpp
index 93ac09cf95..c90c04f8cc 100644
--- a/tests/validation/reference/Gather.cpp
+++ b/tests/validation/reference/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,27 +39,56 @@ namespace reference
 template <typename T>
 SimpleTensor<T> gather(const SimpleTensor<T> &src, const SimpleTensor<uint32_t> &indices, uint32_t actual_axis)
 {
-    const auto       *indices_ptr = static_cast<const uint32_t *>(indices.data());
     const TensorShape dst_shape   = arm_compute::misc::shape_calculator::compute_gather_shape(src.shape(), indices.shape(), actual_axis);
     SimpleTensor<T>   dst(dst_shape, src.data_type());
 
+    const auto        src_ptr     = static_cast<const T *>(src.data());
+    const auto        indices_ptr = static_cast<const uint32_t *>(indices.data());
+    const auto        dst_ptr     = static_cast<T *>(dst.data());
+
+    const uint32_t index_limit = src.shape()[actual_axis];
+
     Window win;
     win.use_tensor_dimensions(dst_shape);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        Coordinates offset;
-        for(unsigned int dim = 0; dim < id.num_dimensions(); ++dim)
+
+    execute_window_loop(win, [&](const Coordinates &dst_coords) {
+        const auto dst_addr = coords2index(dst.shape(), dst_coords);
+
+        // Calculate the coordinates of the index value.
+        Coordinates idx_coords;
+
+        for(size_t i = 0; i < indices.shape().num_dimensions(); ++i)
         {
-            if(dim == actual_axis)
+            idx_coords.set(i, dst_coords[i + actual_axis]);
+        }
+
+        const auto index = indices_ptr[coords2index(indices.shape(), idx_coords)];
+
+        if(index < index_limit)
+        {
+            // Calculate the coordinates of the source data.
+            Coordinates src_coords;
+
+            for(size_t i = 0; i < actual_axis; ++i)
             {
-                offset.set(dim, indices_ptr[id[dim]]);
+                src_coords.set(i, dst_coords[i]);
             }
-            else
+
+            src_coords.set(actual_axis, index);
+
+            for(size_t i = actual_axis + 1; i < src.shape().num_dimensions(); ++i)
             {
-                offset.set(dim, id[dim]);
+                src_coords.set(i, dst_coords[i + indices.shape().num_dimensions() - 1]);
             }
+
+            // Copy the data.
+            const auto src_addr = coords2index(src.shape(), src_coords);
+            dst_ptr[dst_addr] = src_ptr[src_addr];
+        }
+        else
+        {
+            dst_ptr[dst_addr] = 0;
         }
-        *reinterpret_cast<T *>(dst(id)) = *reinterpret_cast<const T *>(src(offset));
     });
 
     return dst;
@@ -72,4 +101,4 @@ template SimpleTensor<uint8_t> gather(const SimpleTensor<uint8_t> &src, const Si
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/reference/Gaussian3x3.cpp b/tests/validation/reference/Gaussian3x3.cpp
deleted file mode 100644
index 2e307e8152..0000000000
--- a/tests/validation/reference/Gaussian3x3.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-
-#include "Gaussian3x3.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> gaussian3x3(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-    const std::array<T, 9> filter{ { 1, 2, 1, 2, 4, 2, 1, 2, 1 } };
-    const float    scale        = 1.f / 16.f;
-    const uint32_t num_elements = src.num_elements();
-
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-    {
-        const Coordinates id = index2coord(src.shape(), element_idx);
-        apply_2d_spatial_filter(id, src, dst, TensorShape(3U, 3U), filter.data(), scale, border_mode, constant_border_value);
-    }
-    return dst;
-}
-
-template SimpleTensor<uint8_t> gaussian3x3(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Gaussian3x3.h b/tests/validation/reference/Gaussian3x3.h
deleted file mode 100644
index a433db6693..0000000000
--- a/tests/validation/reference/Gaussian3x3.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GAUSSIAN3X3_H
-#define ARM_COMPUTE_TEST_GAUSSIAN3X3_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> gaussian3x3(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GAUSSIAN3X3_H */
diff --git a/tests/validation/reference/Gaussian5x5.cpp b/tests/validation/reference/Gaussian5x5.cpp
deleted file mode 100644
index 2133d8980e..0000000000
--- a/tests/validation/reference/Gaussian5x5.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-
-#include "Gaussian5x5.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> gaussian5x5(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-    const std::array<T, 25> filter{ {
-            1, 4, 6, 4, 1,
-            4, 16, 24, 16, 4,
-            6, 24, 36, 24, 6,
-            4, 16, 24, 16, 4,
-            1, 4, 6, 4, 1
-        } };
-    const float    scale        = 1.f / 256.f;
-    const uint32_t num_elements = src.num_elements();
-
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-    {
-        const Coordinates id = index2coord(src.shape(), element_idx);
-        apply_2d_spatial_filter(id, src, dst, TensorShape(5U, 5U), filter.data(), scale, border_mode, constant_border_value);
-    }
-    return dst;
-}
-
-template SimpleTensor<uint8_t> gaussian5x5(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Gaussian5x5.h b/tests/validation/reference/Gaussian5x5.h
deleted file mode 100644
index 42920bd4c0..0000000000
--- a/tests/validation/reference/Gaussian5x5.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GAUSSIAN5X5_H
-#define ARM_COMPUTE_TEST_GAUSSIAN5X5_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> gaussian5x5(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GAUSSIAN5X5_H */
diff --git a/tests/validation/reference/GaussianPyramidHalf.cpp b/tests/validation/reference/GaussianPyramidHalf.cpp
deleted file mode 100644
index 5bddd853a0..0000000000
--- a/tests/validation/reference/GaussianPyramidHalf.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "GaussianPyramidHalf.h"
-
-#include "arm_compute/core/Helpers.h"
-
-#include "Gaussian5x5.h"
-#include "Scale.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-std::vector<SimpleTensor<T>> gaussian_pyramid_half(const SimpleTensor<T> &src, BorderMode border_mode, uint8_t constant_border_value, size_t num_levels)
-{
-    std::vector<SimpleTensor<T>> dst;
-
-    // Level0 is equal to src
-    dst.push_back(src);
-
-    for(size_t i = 1; i < num_levels; ++i)
-    {
-        // Gaussian Filter
-        const SimpleTensor<T> out_gaus5x5 = reference::gaussian5x5(dst[i - 1], border_mode, constant_border_value);
-
-        // Scale down by 2 with nearest interpolation
-        const SimpleTensor<T> out = reference::scale(out_gaus5x5, SCALE_PYRAMID_HALF, SCALE_PYRAMID_HALF, InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value, SamplingPolicy::CENTER,
-                                                     true);
-
-        dst.push_back(out);
-    }
-
-    return dst;
-}
-
-template std::vector<SimpleTensor<uint8_t>> gaussian_pyramid_half(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value, size_t num_levels);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/GaussianPyramidHalf.h b/tests/validation/reference/GaussianPyramidHalf.h
deleted file mode 100644
index 225ef00727..0000000000
--- a/tests/validation/reference/GaussianPyramidHalf.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GAUSSIAN_PYRAMID_HALF_H
-#define ARM_COMPUTE_TEST_GAUSSIAN_PYRAMID_HALF_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-std::vector<SimpleTensor<T>> gaussian_pyramid_half(const SimpleTensor<T> &src, BorderMode border_mode, uint8_t constant_border_value, size_t num_levels);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GAUSSIAN_PYRAMID_HALF_H */
-\ No newline at end of file
diff --git a/tests/validation/reference/HOGDescriptor.cpp b/tests/validation/reference/HOGDescriptor.cpp
deleted file mode 100644
index e00beaf5d7..0000000000
--- a/tests/validation/reference/HOGDescriptor.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "HOGDescriptor.h"
-
-#include "Derivative.h"
-#include "Magnitude.h"
-#include "Phase.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-template <typename T>
-void hog_orientation_compute(const SimpleTensor<T> &mag, const SimpleTensor<T> &phase, std::vector<T> &bins, const HOGInfo &hog_info)
-{
-    const Size2D &cell_size = hog_info.cell_size();
-    const size_t  num_bins  = hog_info.num_bins();
-
-    float phase_scale = (PhaseType::SIGNED == hog_info.phase_type() ? num_bins / 360.0f : num_bins / 180.0f);
-    phase_scale *= (PhaseType::SIGNED == hog_info.phase_type() ? 360.0f / 255.0f : 1.0f);
-
-    int row_idx = 0;
-    for(size_t yc = 0; yc < cell_size.height; ++yc)
-    {
-        for(size_t xc = 0; xc < cell_size.width; xc++)
-        {
-            const float mag_value   = mag[(row_idx + xc)];
-            const float phase_value = phase[(row_idx + xc)] * phase_scale + 0.5f;
-            const float w1          = phase_value - floor(phase_value);
-
-            // The quantised phase is the histogram index [0, num_bins - 1]
-            // Check limit of histogram index. If hidx == num_bins, hidx = 0
-            const auto hidx = static_cast<unsigned int>(phase_value) % num_bins;
-
-            // Weighted vote between 2 bins
-            bins[hidx] += mag_value * (1.0f - w1);
-            bins[(hidx + 1) % num_bins] += mag_value * w1;
-        }
-
-        row_idx += cell_size.width;
-    }
-}
-
-template <typename T>
-void hog_block_normalization_compute(SimpleTensor<T> &block, SimpleTensor<T> &desc, const HOGInfo &hog_info, uint32_t block_idx)
-{
-    const int         num_bins_per_block = desc.num_channels();
-    const HOGNormType norm_type          = hog_info.normalization_type();
-    const Coordinates id                 = index2coord(desc.shape(), block_idx);
-
-    float sum = 0.0f;
-
-    // Calculate sum
-    for(int i = 0; i < num_bins_per_block; ++i)
-    {
-        const float val = block[i];
-        sum += (norm_type == HOGNormType::L1_NORM) ? std::fabs(val) : val * val;
-    }
-
-    // Calculate normalization scale
-    float scale = 1.0f / (std::sqrt(sum) + num_bins_per_block * 0.1f);
-
-    if(norm_type == HOGNormType::L2HYS_NORM)
-    {
-        // Reset sum
-        sum = 0.0f;
-        for(int i = 0; i < num_bins_per_block; ++i)
-        {
-            float val = block[i] * scale;
-
-            // Clip scaled input_value if over l2_hyst_threshold
-            val = fmin(val, hog_info.l2_hyst_threshold());
-            sum += val * val;
-            block[i] = val;
-        }
-
-        // We use the same constants of OpenCV
-        scale = 1.0f / (std::sqrt(sum) + 1e-3f);
-    }
-
-    for(int i = 0; i < num_bins_per_block; ++i)
-    {
-        block[i] *= scale;
-        reinterpret_cast<float *>(desc(id))[i] = block[i];
-    }
-}
-} // namespace
-
-template <typename T, typename U, typename V>
-void hog_orientation_binning(const SimpleTensor<T> &mag, const SimpleTensor<U> &phase, SimpleTensor<V> &hog_space, const HOGInfo &hog_info)
-{
-    const Size2D &cell_size = hog_info.cell_size();
-
-    const size_t num_bins     = hog_info.num_bins();
-    const size_t shape_width  = hog_space.shape().x() * hog_info.cell_size().width;
-    const size_t shape_height = hog_space.shape().y() * hog_info.cell_size().height;
-
-    TensorShape cell_shape(cell_size.width, cell_size.height);
-
-    SimpleTensor<V> mag_cell(cell_shape, DataType::F32);
-    SimpleTensor<V> phase_cell(cell_shape, DataType::F32);
-
-    int cell_idx = 0;
-    int y_offset = 0;
-
-    // Traverse shape
-    for(auto sy = cell_size.height; sy <= shape_height; sy += cell_size.height)
-    {
-        int x_offset = 0;
-        for(auto sx = cell_size.width; sx <= shape_width; sx += cell_size.width)
-        {
-            int row_idx  = 0;
-            int elem_idx = 0;
-
-            // Traverse cell
-            for(auto y = 0u; y < cell_size.height; ++y)
-            {
-                for(auto x = 0u; x < cell_size.width; ++x)
-                {
-                    int shape_idx        = x + row_idx + x_offset + y_offset;
-                    mag_cell[elem_idx]   = mag[shape_idx];
-                    phase_cell[elem_idx] = phase[shape_idx];
-                    elem_idx++;
-                }
-
-                row_idx += shape_width;
-            }
-
-            // Partition magnitude values into bins based on phase values
-            std::vector<V> bins(num_bins);
-            hog_orientation_compute(mag_cell, phase_cell, bins, hog_info);
-
-            for(size_t i = 0; i < num_bins; ++i)
-            {
-                hog_space[cell_idx * num_bins + i] = bins[i];
-            }
-
-            x_offset += cell_size.width;
-            cell_idx++;
-        }
-
-        y_offset += (cell_size.height * shape_width);
-    }
-}
-
-template <typename T>
-void hog_block_normalization(SimpleTensor<T> &desc, const SimpleTensor<T> &hog_space, const HOGInfo &hog_info)
-{
-    const Size2D  cells_per_block        = hog_info.num_cells_per_block();
-    const Size2D  cells_per_block_stride = hog_info.num_cells_per_block_stride();
-    const Size2D &block_size             = hog_info.block_size();
-    const Size2D &block_stride           = hog_info.block_stride();
-    const size_t  num_bins               = hog_info.num_bins();
-
-    const size_t shape_width          = hog_space.shape().x() * hog_info.cell_size().width;
-    const size_t shape_height         = hog_space.shape().y() * hog_info.cell_size().height;
-    const size_t num_bins_per_block_x = cells_per_block.width * num_bins;
-
-    // Tensor representing single block
-    SimpleTensor<T> block(TensorShape{ 1u, 1u }, DataType::F32, cells_per_block.area() * num_bins);
-
-    uint32_t block_idx      = 0;
-    int      block_y_offset = 0;
-
-    // Traverse shape
-    for(auto sy = block_size.height; sy <= shape_height; sy += block_stride.height)
-    {
-        int block_x_offset = 0;
-        for(auto sx = block_size.width; sx <= shape_width; sx += block_stride.width)
-        {
-            int cell_y_offset = 0;
-            int elem_idx      = 0;
-
-            // Traverse block
-            for(auto y = 0u; y < cells_per_block.height; ++y)
-            {
-                for(auto x = 0u; x < num_bins_per_block_x; ++x)
-                {
-                    int idx         = x + cell_y_offset + block_x_offset + block_y_offset;
-                    block[elem_idx] = hog_space[idx];
-                    elem_idx++;
-                }
-
-                cell_y_offset += hog_space.shape().x() * num_bins;
-            }
-
-            // Normalize block and write to descriptor
-            hog_block_normalization_compute(block, desc, hog_info, block_idx);
-
-            block_x_offset += cells_per_block_stride.width * num_bins;
-            block_idx++;
-        }
-
-        block_y_offset += cells_per_block_stride.height * num_bins * hog_space.shape().x();
-    }
-}
-
-template <typename T, typename U>
-SimpleTensor<T> hog_descriptor(const SimpleTensor<U> &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info)
-{
-    SimpleTensor<int16_t> grad_x;
-    SimpleTensor<int16_t> grad_y;
-
-    // Create tensor info for HOG descriptor
-    TensorInfo      desc_info(hog_info, src.shape().x(), src.shape().y());
-    SimpleTensor<T> desc(desc_info.tensor_shape(), DataType::F32, desc_info.num_channels());
-
-    // Create HOG space tensor (num_cells_x, num_cells_y)
-    TensorShape hog_space_shape(src.shape().x() / hog_info.cell_size().width,
-                                src.shape().y() / hog_info.cell_size().height);
-
-    // For each cell a histogram with a num_bins is created
-    TensorInfo      info_hog_space(hog_space_shape, hog_info.num_bins(), DataType::F32);
-    SimpleTensor<T> hog_space(info_hog_space.tensor_shape(), DataType::F32, info_hog_space.num_channels());
-
-    // Calculate derivative
-    std::tie(grad_x, grad_y) = derivative<int16_t>(src, border_mode, constant_border_value, GradientDimension::GRAD_XY);
-
-    // For each cell create histogram based on magnitude and phase
-    hog_orientation_binning(magnitude(grad_x, grad_y, MagnitudeType::L2NORM),
-                            phase(grad_x, grad_y, hog_info.phase_type()),
-                            hog_space,
-                            hog_info);
-
-    // Normalize histograms based on block size
-    hog_block_normalization(desc, hog_space, hog_info);
-
-    return desc;
-}
-
-template void hog_orientation_binning(const SimpleTensor<int16_t> &mag, const SimpleTensor<uint8_t> &phase, SimpleTensor<float> &hog_space, const HOGInfo &hog_info);
-template void hog_block_normalization(SimpleTensor<float> &desc, const SimpleTensor<float> &hog_space, const HOGInfo &hog_info);
-template SimpleTensor<float> hog_descriptor(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value, const HOGInfo &hog_info);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/HOGDetector.cpp b/tests/validation/reference/HOGDetector.cpp
deleted file mode 100644
index 798c3fc142..0000000000
--- a/tests/validation/reference/HOGDetector.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "HOGDetector.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-/** Computes the number of detection windows to iterate over in the feature vector. */
-Size2D num_detection_windows(const TensorShape &shape, const Size2D &window_step, const HOGInfo &hog_info)
-{
-    const size_t num_block_strides_width  = hog_info.detection_window_size().width / hog_info.block_stride().width;
-    const size_t num_block_strides_height = hog_info.detection_window_size().height / hog_info.block_stride().height;
-
-    return Size2D{ floor_to_multiple(shape.x() - num_block_strides_width, window_step.width) + window_step.width,
-                   floor_to_multiple(shape.y() - num_block_strides_height, window_step.height) + window_step.height };
-}
-} // namespace
-
-template <typename T>
-std::vector<DetectionWindow> hog_detector(const SimpleTensor<T> &src, const std::vector<T> &descriptor, unsigned int max_num_detection_windows,
-                                          const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
-{
-    ARM_COMPUTE_ERROR_ON_MSG((detection_window_stride.width % hog_info.block_stride().width != 0),
-                             "Detection window stride width must be multiple of block stride width");
-    ARM_COMPUTE_ERROR_ON_MSG((detection_window_stride.height % hog_info.block_stride().height != 0),
-                             "Detection window stride height must be multiple of block stride height");
-
-    // Create vector for identifying each detection window
-    std::vector<DetectionWindow> windows;
-
-    // Calculate detection window step
-    const Size2D window_step(detection_window_stride.width / hog_info.block_stride().width,
-                             detection_window_stride.height / hog_info.block_stride().height);
-
-    // Calculate number of detection windows
-    const Size2D num_windows = num_detection_windows(src.shape(), window_step, hog_info);
-
-    // Calculate detection window and row offsets in feature vector
-    const size_t src_offset_x   = window_step.width * hog_info.num_bins() * hog_info.num_cells_per_block().area();
-    const size_t src_offset_y   = window_step.height * hog_info.num_bins() * hog_info.num_cells_per_block().area() * src.shape().x();
-    const size_t src_offset_row = src.num_channels() * src.shape().x();
-
-    // Calculate detection window attributes
-    const Size2D       num_block_positions_per_detection_window = hog_info.num_block_positions_per_image(hog_info.detection_window_size());
-    const unsigned int num_bins_per_descriptor_x                = num_block_positions_per_detection_window.width * src.num_channels();
-    const unsigned int num_blocks_per_descriptor_y              = num_block_positions_per_detection_window.height;
-
-    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog_info.descriptor_size());
-
-    size_t win_id = 0;
-
-    // Traverse feature vector in detection window steps
-    for(auto win_y = 0u, offset_y = 0u; win_y < num_windows.height; win_y += window_step.height, offset_y += src_offset_y)
-    {
-        for(auto win_x = 0u, offset_x = 0u; win_x < num_windows.width; win_x += window_step.width, offset_x += src_offset_x)
-        {
-            // Reset the score
-            float score = 0.0f;
-
-            // Traverse detection window
-            for(auto y = 0u, offset_row = 0u; y < num_blocks_per_descriptor_y; ++y, offset_row += src_offset_row)
-            {
-                const int bin_offset = y * num_bins_per_descriptor_x;
-
-                for(auto x = 0u; x < num_bins_per_descriptor_x; ++x)
-                {
-                    // Compute Linear SVM
-                    const float a = src[x + offset_x + offset_y + offset_row];
-                    const float b = descriptor[x + bin_offset];
-                    score += a * b;
-                }
-            }
-
-            // Add the bias. The bias is located at the position (descriptor_size() - 1)
-            score += descriptor[num_bins_per_descriptor_x * num_blocks_per_descriptor_y];
-
-            if(score > threshold)
-            {
-                DetectionWindow window;
-
-                if(win_id++ < max_num_detection_windows)
-                {
-                    window.x         = win_x * hog_info.block_stride().width;
-                    window.y         = win_y * hog_info.block_stride().height;
-                    window.width     = hog_info.detection_window_size().width;
-                    window.height    = hog_info.detection_window_size().height;
-                    window.idx_class = idx_class;
-                    window.score     = score;
-
-                    windows.push_back(window);
-                }
-            }
-        }
-    }
-
-    return windows;
-}
-
-template std::vector<DetectionWindow> hog_detector(const SimpleTensor<float> &src, const std::vector<float> &descriptor, unsigned int max_num_detection_windows,
-                                                   const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/HOGMultiDetection.cpp b/tests/validation/reference/HOGMultiDetection.cpp
deleted file mode 100644
index 50d846c0be..0000000000
--- a/tests/validation/reference/HOGMultiDetection.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "HOGMultiDetection.h"
-
-#include "Derivative.h"
-#include "HOGDescriptor.h"
-#include "HOGDetector.h"
-#include "Magnitude.h"
-#include "Phase.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-void validate_models(const std::vector<HOGInfo> &models)
-{
-    ARM_COMPUTE_ERROR_ON(0 == models.size());
-
-    for(size_t i = 1; i < models.size(); ++i)
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(models[0].phase_type() != models[i].phase_type(),
-                                 "All HOG parameters must have the same phase type");
-
-        ARM_COMPUTE_ERROR_ON_MSG(models[0].normalization_type() != models[i].normalization_type(),
-                                 "All HOG parameters must have the same normalization_type");
-
-        ARM_COMPUTE_ERROR_ON_MSG((models[0].l2_hyst_threshold() != models[i].l2_hyst_threshold()) && (models[0].normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
-                                 "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type");
-    }
-}
-} // namespace
-
-void detection_windows_non_maxima_suppression(std::vector<DetectionWindow> &multi_windows, float min_distance)
-{
-    const size_t num_candidates = multi_windows.size();
-    size_t       num_detections = 0;
-
-    // Sort by idx_class first and by score second
-    std::sort(multi_windows.begin(), multi_windows.end(), [](const DetectionWindow & lhs, const DetectionWindow & rhs)
-    {
-        if(lhs.idx_class < rhs.idx_class)
-        {
-            return true;
-        }
-        if(rhs.idx_class < lhs.idx_class)
-        {
-            return false;
-        }
-
-        // idx_classes are equal so compare by score
-        if(lhs.score > rhs.score)
-        {
-            return true;
-        }
-        if(rhs.score > lhs.score)
-        {
-            return false;
-        }
-
-        return false;
-    });
-
-    const float min_distance_pow2 = min_distance * min_distance;
-
-    // Euclidean distance
-    for(size_t i = 0; i < num_candidates; ++i)
-    {
-        if(0.0f != multi_windows.at(i).score)
-        {
-            DetectionWindow cur;
-            cur.x         = multi_windows.at(i).x;
-            cur.y         = multi_windows.at(i).y;
-            cur.width     = multi_windows.at(i).width;
-            cur.height    = multi_windows.at(i).height;
-            cur.idx_class = multi_windows.at(i).idx_class;
-            cur.score     = multi_windows.at(i).score;
-
-            // Store window
-            multi_windows.at(num_detections) = cur;
-            ++num_detections;
-
-            const float xc = cur.x + cur.width * 0.5f;
-            const float yc = cur.y + cur.height * 0.5f;
-
-            for(size_t k = i + 1; k < (num_candidates) && (cur.idx_class == multi_windows.at(k).idx_class); ++k)
-            {
-                const float xn = multi_windows.at(k).x + multi_windows.at(k).width * 0.5f;
-                const float yn = multi_windows.at(k).y + multi_windows.at(k).height * 0.5f;
-
-                const float dx = std::fabs(xn - xc);
-                const float dy = std::fabs(yn - yc);
-
-                if(dx < min_distance && dy < min_distance)
-                {
-                    const float d = dx * dx + dy * dy;
-
-                    if(d < min_distance_pow2)
-                    {
-                        // Invalidate detection window
-                        multi_windows.at(k).score = 0.0f;
-                    }
-                }
-            }
-        }
-    }
-
-    multi_windows.resize(num_detections);
-}
-
-template <typename T>
-std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value,
-                                                 const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
-                                                 unsigned int max_num_detection_windows, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON(descriptors.size() != models.size());
-    validate_models(models);
-
-    const size_t width      = src.shape().x();
-    const size_t height     = src.shape().y();
-    const size_t num_models = models.size();
-
-    // Initialize previous values
-    size_t prev_num_bins     = models[0].num_bins();
-    Size2D prev_cell_size    = models[0].cell_size();
-    Size2D prev_block_size   = models[0].block_size();
-    Size2D prev_block_stride = models[0].block_stride();
-
-    std::vector<size_t> input_orient_bin;
-    std::vector<size_t> input_hog_detect;
-    std::vector<std::pair<size_t, size_t>> input_block_norm;
-
-    input_orient_bin.push_back(0);
-    input_hog_detect.push_back(0);
-    input_block_norm.emplace_back(0, 0);
-
-    // Iterate through the number of models and check if orientation binning
-    // and block normalization steps can be skipped
-    for(size_t i = 1; i < num_models; ++i)
-    {
-        size_t cur_num_bins     = models[i].num_bins();
-        Size2D cur_cell_size    = models[i].cell_size();
-        Size2D cur_block_size   = models[i].block_size();
-        Size2D cur_block_stride = models[i].block_stride();
-
-        // Check if binning and normalization steps are required
-        if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
-        {
-            prev_num_bins     = cur_num_bins;
-            prev_cell_size    = cur_cell_size;
-            prev_block_size   = cur_block_size;
-            prev_block_stride = cur_block_stride;
-
-            // Compute orientation binning and block normalization. Update input to process
-            input_orient_bin.push_back(i);
-            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
-        }
-        else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
-                || (cur_block_stride.height != prev_block_stride.height))
-        {
-            prev_block_size   = cur_block_size;
-            prev_block_stride = cur_block_stride;
-
-            // Compute block normalization. Update input to process
-            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
-        }
-
-        // Update input to process for hog detector
-        input_hog_detect.push_back(input_block_norm.size() - 1);
-    }
-
-    size_t num_orient_bin = input_orient_bin.size();
-    size_t num_block_norm = input_block_norm.size();
-    size_t num_hog_detect = input_hog_detect.size();
-
-    std::vector<SimpleTensor<float>> hog_spaces(num_orient_bin);
-    std::vector<SimpleTensor<float>> hog_norm_spaces(num_block_norm);
-
-    // Calculate derivative
-    SimpleTensor<int16_t> grad_x;
-    SimpleTensor<int16_t> grad_y;
-    std::tie(grad_x, grad_y) = derivative<int16_t>(src, border_mode, constant_border_value, GradientDimension::GRAD_XY);
-
-    // Calculate magnitude and phase
-    SimpleTensor<int16_t> _mag   = magnitude(grad_x, grad_y, MagnitudeType::L2NORM);
-    SimpleTensor<uint8_t> _phase = phase(grad_x, grad_y, models[0].phase_type());
-
-    // Calculate Tensors for the HOG space and orientation binning
-    for(size_t i = 0; i < num_orient_bin; ++i)
-    {
-        const size_t idx_multi_hog = input_orient_bin[i];
-
-        const size_t num_bins    = models[idx_multi_hog].num_bins();
-        const size_t num_cells_x = width / models[idx_multi_hog].cell_size().width;
-        const size_t num_cells_y = height / models[idx_multi_hog].cell_size().height;
-
-        // TensorShape of hog space
-        TensorShape hog_space_shape(num_cells_x, num_cells_y);
-
-        // Initialise HOG space
-        TensorInfo info_hog_space(hog_space_shape, num_bins, DataType::F32);
-        hog_spaces.at(i) = SimpleTensor<float>(info_hog_space.tensor_shape(), DataType::F32, info_hog_space.num_channels());
-
-        // For each cell create histogram based on magnitude and phase
-        hog_orientation_binning(_mag, _phase, hog_spaces[i], models[idx_multi_hog]);
-    }
-
-    // Calculate Tensors for the normalized HOG space and block normalization
-    for(size_t i = 0; i < num_block_norm; ++i)
-    {
-        const size_t idx_multi_hog  = input_block_norm[i].first;
-        const size_t idx_orient_bin = input_block_norm[i].second;
-
-        // Create tensor info for HOG descriptor
-        TensorInfo tensor_info(models[idx_multi_hog], src.shape().x(), src.shape().y());
-        hog_norm_spaces.at(i) = SimpleTensor<float>(tensor_info.tensor_shape(), DataType::F32, tensor_info.num_channels());
-
-        // Normalize histograms based on block size
-        hog_block_normalization(hog_norm_spaces[i], hog_spaces[idx_orient_bin], models[idx_multi_hog]);
-    }
-
-    std::vector<DetectionWindow> multi_windows;
-
-    // Calculate Detection Windows for HOG detector
-    for(size_t i = 0; i < num_hog_detect; ++i)
-    {
-        const size_t idx_block_norm = input_hog_detect[i];
-
-        // NOTE: Detection window stride fixed to block stride
-        const Size2D detection_window_stride = models[i].block_stride();
-
-        std::vector<DetectionWindow> windows = hog_detector(hog_norm_spaces[idx_block_norm], descriptors[i],
-                                                            max_num_detection_windows, models[i], detection_window_stride, threshold, i);
-
-        multi_windows.insert(multi_windows.end(), windows.begin(), windows.end());
-    }
-
-    // Suppress Non-maxima detection windows
-    if(non_maxima_suppression)
-    {
-        detection_windows_non_maxima_suppression(multi_windows, min_distance);
-    }
-
-    return multi_windows;
-}
-
-template std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value,
-                                                          const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
-                                                          unsigned int max_num_detection_windows, float threshold, bool non_maxima_suppression, float min_distance);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/HarrisCornerDetector.cpp b/tests/validation/reference/HarrisCornerDetector.cpp
deleted file mode 100644
index 6c46b3de5d..0000000000
--- a/tests/validation/reference/HarrisCornerDetector.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "HarrisCornerDetector.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/NonMaximaSuppression.h"
-#include "tests/validation/reference/Sobel.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-template <typename T>
-std::tuple<SimpleTensor<T>, SimpleTensor<T>, float> compute_sobel(const SimpleTensor<uint8_t> &src, int gradient_size, int block_size, BorderMode border_mode, uint8_t constant_border_value)
-{
-    SimpleTensor<T> grad_x;
-    SimpleTensor<T> grad_y;
-    float           norm_factor = 0.f;
-
-    std::tie(grad_x, grad_y) = sobel<T>(src, gradient_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
-
-    switch(gradient_size)
-    {
-        case 3:
-            norm_factor = 1.f / (4 * 255 * block_size);
-            break;
-        case 5:
-            norm_factor = 1.f / (16 * 255 * block_size);
-            break;
-        case 7:
-            norm_factor = 1.f / (64 * 255 * block_size);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Gradient size not supported.");
-    }
-
-    return std::make_tuple(grad_x, grad_y, norm_factor);
-}
-
-template <typename T, typename U>
-std::vector<KeyPoint> harris_corner_detector_impl(const SimpleTensor<U> &src, float threshold, float min_dist, float sensitivity, int gradient_size, int block_size, BorderMode border_mode,
-                                                  U constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON(block_size != 3 && block_size != 5 && block_size != 7);
-
-    SimpleTensor<T> grad_x;
-    SimpleTensor<T> grad_y;
-    float           norm_factor = 0.f;
-
-    // Sobel
-    std::tie(grad_x, grad_y, norm_factor) = compute_sobel<T>(src, gradient_size, block_size, border_mode, constant_border_value);
-
-    SimpleTensor<float> scores(src.shape(), DataType::F32);
-    ValidRegion         scores_region = shape_to_valid_region(scores.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(gradient_size / 2 + block_size / 2));
-
-    // Calculate scores
-    for(int i = 0; i < scores.num_elements(); ++i)
-    {
-        Coordinates src_coord = index2coord(src.shape(), i);
-        Coordinates block_top_left{ src_coord.x() - block_size / 2, src_coord.y() - block_size / 2 };
-        Coordinates block_bottom_right{ src_coord.x() + block_size / 2, src_coord.y() + block_size / 2 };
-
-        if(!is_in_valid_region(scores_region, src_coord))
-        {
-            scores[i] = 0.f;
-            continue;
-        }
-
-        float Gx2 = 0.f;
-        float Gy2 = 0.f;
-        float Gxy = 0.f;
-
-        // Calculate Gx^2, Gy^2 and Gxy within the given window
-        for(int y = block_top_left.y(); y <= block_bottom_right.y(); ++y)
-        {
-            for(int x = block_top_left.x(); x <= block_bottom_right.x(); ++x)
-            {
-                Coordinates block_coord(x, y);
-
-                const float norm_x = tensor_elem_at(grad_x, block_coord, border_mode, static_cast<T>(constant_border_value)) * norm_factor;
-                const float norm_y = tensor_elem_at(grad_y, block_coord, border_mode, static_cast<T>(constant_border_value)) * norm_factor;
-
-                Gx2 += std::pow(norm_x, 2);
-                Gy2 += std::pow(norm_y, 2);
-                Gxy += norm_x * norm_y;
-            }
-        }
-
-        const float trace2   = std::pow(Gx2 + Gy2, 2);
-        const float det      = Gx2 * Gy2 - std::pow(Gxy, 2);
-        const float response = det - sensitivity * trace2;
-
-        if(response > threshold)
-        {
-            scores[i] = response;
-        }
-        else
-        {
-            scores[i] = 0.f;
-        }
-    }
-
-    // Suppress non-maxima candidates
-    SimpleTensor<float> suppressed_scores        = non_maxima_suppression(scores, border_mode != BorderMode::UNDEFINED ? BorderMode::CONSTANT : BorderMode::UNDEFINED, 0.f);
-    ValidRegion         suppressed_scores_region = shape_to_valid_region(suppressed_scores.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(gradient_size / 2 + block_size / 2 + 1));
-
-    // Create vector of candidate corners
-    std::vector<KeyPoint> corner_candidates;
-
-    for(int i = 0; i < suppressed_scores.num_elements(); ++i)
-    {
-        Coordinates coord = index2coord(suppressed_scores.shape(), i);
-
-        if(is_in_valid_region(suppressed_scores_region, coord) && suppressed_scores[i] != 0.f)
-        {
-            KeyPoint corner;
-            corner.x               = coord.x();
-            corner.y               = coord.y();
-            corner.tracking_status = 1;
-            corner.strength        = suppressed_scores[i];
-            corner.scale           = 0.f;
-            corner.orientation     = 0.f;
-            corner.error           = 0.f;
-
-            corner_candidates.emplace_back(corner);
-        }
-    }
-
-    // Sort descending by strength
-    std::sort(corner_candidates.begin(), corner_candidates.end(), [](const KeyPoint & a, const KeyPoint & b)
-    {
-        return a.strength > b.strength;
-    });
-
-    std::vector<KeyPoint> corners;
-    corners.reserve(corner_candidates.size());
-
-    // Only add corner if there is no stronger within min_dist
-    for(const KeyPoint &point : corner_candidates)
-    {
-        const auto strongest = std::find_if(corners.begin(), corners.end(), [&](const KeyPoint & other)
-        {
-            return std::sqrt((std::pow(point.x - other.x, 2) + std::pow(point.y - other.y, 2))) < min_dist;
-        });
-
-        if(strongest == corners.end())
-        {
-            corners.emplace_back(point);
-        }
-    }
-
-    corners.shrink_to_fit();
-
-    return corners;
-}
-} // namespace
-
-template <typename T>
-std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<T> &src, float threshold, float min_dist, float sensitivity, int gradient_size, int block_size, BorderMode border_mode,
-                                             T constant_border_value)
-{
-    if(gradient_size < 7)
-    {
-        return harris_corner_detector_impl<int16_t>(src, threshold, min_dist, sensitivity, gradient_size, block_size, border_mode, constant_border_value);
-    }
-    else
-    {
-        return harris_corner_detector_impl<int32_t>(src, threshold, min_dist, sensitivity, gradient_size, block_size, border_mode, constant_border_value);
-    }
-}
-
-template std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<uint8_t> &src, float threshold, float min_dist, float sensitivity, int gradient_size, int block_size, BorderMode border_mode,
-                                                      uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Histogram.cpp b/tests/validation/reference/Histogram.cpp
deleted file mode 100644
index f9c77108e1..0000000000
--- a/tests/validation/reference/Histogram.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Histogram.h"
-
-#include "Utils.h"
-#include "arm_compute/core/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint32_t> histogram(const SimpleTensor<T> &src, size_t num_bins, int32_t offset, uint32_t range)
-{
-    SimpleTensor<uint32_t> dst(TensorShape(num_bins), DataType::U32);
-
-    // Clear the distribution
-    for(size_t element_idx = 0; element_idx < num_bins; ++element_idx)
-    {
-        dst[element_idx] = 0;
-    }
-
-    // Create the histogram
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
-    {
-        if((offset <= src[element_idx]) && (src[element_idx] < (offset + range)))
-        {
-            const int index = (src[element_idx] - offset) * num_bins / range;
-            dst[index]++;
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint32_t> histogram(const SimpleTensor<uint8_t> &src, size_t num_bins, int32_t offset, uint32_t range);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Histogram.h b/tests/validation/reference/Histogram.h
deleted file mode 100644
index 5f6c7d27a2..0000000000
--- a/tests/validation/reference/Histogram.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_HISTOGRAM_H
-#define ARM_COMPUTE_TEST_HISTOGRAM_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint32_t> histogram(const SimpleTensor<T> &src, size_t num_bins, int32_t offset, uint32_t range);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HISTOGRAM_H */
diff --git a/tests/validation/reference/IndirectConv2dAddressPrecalculation.cpp b/tests/validation/reference/IndirectConv2dAddressPrecalculation.cpp
new file mode 100644
index 0000000000..7500560c91
--- /dev/null
+++ b/tests/validation/reference/IndirectConv2dAddressPrecalculation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "IndirectConv2dAddressPrecalculation.h"
+
+#include "arm_compute/core/Types.h"
+
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Reference implementation of the indirect conv2d address pre-calculation.
+ *
+ * For every destination element the flattened source index (x + y * width + batch * width * height) read by the
+ * corresponding kernel tap is stored, or -1 when that tap falls outside the source width/height.
+ */
+SimpleTensor<int32_t> indirect_conv2d_addr_precalculation(const TensorShape &shape_conv_src, const TensorShape &shape_conv_wei, const TensorShape &shape_conv_dst, const TensorShape &shape_dst,
+                                                          const PadStrideInfo &conv_info)
+{
+    // Positions of width and height inside the NHWC convolution shapes
+    constexpr unsigned int w_dim = 1;
+    constexpr unsigned int h_dim = 2;
+
+    const int in_w          = static_cast<int32_t>(shape_conv_src[w_dim]);
+    const int in_h          = static_cast<int32_t>(shape_conv_src[h_dim]);
+    const int conv_out_w    = static_cast<int32_t>(shape_conv_dst[w_dim]);
+    const int kernel_w      = static_cast<int32_t>(shape_conv_wei[w_dim]);
+    const int kernel_h      = static_cast<int32_t>(shape_conv_wei[h_dim]);
+    const int kernel_area   = kernel_w * kernel_h;
+    const int table_w       = static_cast<int32_t>(shape_dst[0]);
+    const int table_h       = static_cast<int32_t>(shape_dst[1]);
+    const int table_batches = static_cast<int32_t>(shape_dst[2]);
+    const int step_x        = static_cast<int32_t>(conv_info.stride().first);
+    const int step_y        = static_cast<int32_t>(conv_info.stride().second);
+    const int left          = static_cast<int32_t>(conv_info.pad_left());
+    const int top           = static_cast<int32_t>(conv_info.pad_top());
+
+    // Number of convolution output points covered by one row of the table
+    const int points_per_row = table_w / kernel_area;
+
+    SimpleTensor<int32_t> table{ shape_dst, DataType::S32 };
+
+    for(int batch = 0; batch < table_batches; ++batch)
+    {
+        for(int row = 0; row < table_h; ++row)
+        {
+            for(int tap = 0; tap < kernel_area; ++tap)
+            {
+                const int tap_x = tap % kernel_w;
+                const int tap_y = tap / kernel_w;
+                for(int p = 0; p < points_per_row; ++p)
+                {
+                    const int point  = row * points_per_row + p;
+                    const int src_x  = (point % conv_out_w) * step_x - left + tap_x;
+                    const int src_y  = (point / conv_out_w) * step_y - top + tap_y;
+                    const int linear = src_x + src_y * in_w + batch * in_w * in_h;
+
+                    // Taps outside the source extent are marked with -1
+                    const bool         inside  = (src_x >= 0) && (src_x < in_w) && (src_y >= 0) && (src_y < in_h);
+                    const unsigned int dst_idx = p + tap * points_per_row + row * (table_w) + batch * (table_w * table_h);
+                    table[dst_idx]             = inside ? linear : -1;
+                }
+            }
+        }
+    }
+
+    return table;
+}
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/tests/validation/reference/YOLOLayer.h b/tests/validation/reference/IndirectConv2dAddressPrecalculation.h
index 33cf630aca..f4a90dfd9f 100644
--- a/tests/validation/reference/YOLOLayer.h
+++ b/tests/validation/reference/IndirectConv2dAddressPrecalculation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_YOLO_LAYER_H
-#define ARM_COMPUTE_TEST_YOLO_LAYER_H
+#ifndef ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_H
+#define ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,13 +35,10 @@ namespace validation
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> yolo_layer(const SimpleTensor<T> &src, const ActivationLayerInfo &info, int32_t num_classes);
-
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
-SimpleTensor<T> yolo_layer(const SimpleTensor<T> &src, const ActivationLayerInfo &info, int32_t num_classes);
+SimpleTensor<int32_t> indirect_conv2d_addr_precalculation(const TensorShape &shape_conv_src, const TensorShape &shape_conv_wei, const TensorShape &shape_conv_out, const TensorShape &shape_out,
+                                                          const PadStrideInfo &conv_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_YOLO_LAYER_H */
+#endif /* ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_H */
+\ No newline at end of file
diff --git a/tests/validation/reference/IntegralImage.cpp b/tests/validation/reference/IntegralImage.cpp
deleted file mode 100644
index 0f6a7504fe..0000000000
--- a/tests/validation/reference/IntegralImage.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "IntegralImage.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint32_t> integral_image(const SimpleTensor<T> &src)
-{
-    SimpleTensor<uint32_t> dst(src.shape(), DataType::U32);
-
-    // Length of dimensions
-    const size_t width  = src.shape().x();
-    const size_t height = src.shape().y();
-    const size_t depth  = src.shape().total_size_upper(2);
-
-    const size_t image_size = width * height;
-
-    for(size_t z = 0; z < depth; ++z)
-    {
-        size_t current_image = z * image_size;
-
-        //First element of each image
-        dst[current_image] = src[current_image];
-
-        // First row of each image (add only pixel on the left)
-        for(size_t x = 1; x < width; ++x)
-        {
-            dst[current_image + x] = static_cast<uint32_t>(src[current_image + x]) + dst[current_image + x - 1];
-        }
-
-        // Subsequent rows
-        for(size_t y = 1; y < height; ++y)
-        {
-            size_t current_row = current_image + (width * y);
-
-            // First element of each row (add only pixel up)
-            dst[current_row] = static_cast<uint32_t>(src[current_row]) + dst[current_row - width];
-
-            // Following row elements
-            for(size_t x = 1; x < width; ++x)
-            {
-                size_t current_pixel = current_row + x;
-
-                // out = in + up(out) + left(out) - up_left(out)
-                dst[current_pixel] = static_cast<uint32_t>(src[current_pixel]) + dst[current_pixel - 1]
-                                     + dst[current_pixel - width] - dst[current_pixel - width - 1];
-            }
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint32_t> integral_image(const SimpleTensor<uint8_t> &src);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/IntegralImage.h b/tests/validation/reference/IntegralImage.h
deleted file mode 100644
index 2c9b96a1d6..0000000000
--- a/tests/validation/reference/IntegralImage.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_INTEGRAL_IMAGE_H
-#define ARM_COMPUTE_TEST_INTEGRAL_IMAGE_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint32_t> integral_image(const SimpleTensor<T> &src);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_INTEGRAL_IMAGE_H */
diff --git a/tests/validation/reference/LaplacianPyramid.cpp b/tests/validation/reference/LaplacianPyramid.cpp
deleted file mode 100644
index 904b8403b3..0000000000
--- a/tests/validation/reference/LaplacianPyramid.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "LaplacianPyramid.h"
-
-#include "tests/validation/reference/ArithmeticOperations.h"
-#include "tests/validation/reference/DepthConvertLayer.h"
-#include "tests/validation/reference/Gaussian5x5.h"
-#include "tests/validation/reference/GaussianPyramidHalf.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-std::vector<SimpleTensor<U>> laplacian_pyramid(const SimpleTensor<T> &src, SimpleTensor<U> &dst, size_t num_levels, BorderMode border_mode, uint8_t constant_border_value)
-{
-    std::vector<SimpleTensor<T>> pyramid_conv;
-    std::vector<SimpleTensor<U>> pyramid_dst;
-
-    // First, a Gaussian pyramid with SCALE_PYRAMID_HALF is created
-    std::vector<SimpleTensor<T>> gaussian_level_pyramid = reference::gaussian_pyramid_half(src, border_mode, constant_border_value, num_levels);
-
-    // For each level i, the corresponding image Ii is blurred with Gaussian 5x5
-    // filter, and the difference between the two images is the corresponding
-    // level Li of the Laplacian pyramid
-    for(size_t i = 0; i < num_levels; ++i)
-    {
-        const SimpleTensor<T> level_filtered = reference::gaussian5x5(gaussian_level_pyramid[i], border_mode, constant_border_value);
-        pyramid_conv.push_back(level_filtered);
-
-        const SimpleTensor<U> level_filtered_converted = depth_convert<T, U>(level_filtered, DataType::S16, ConvertPolicy::WRAP, 0);
-        const SimpleTensor<U> gaussian_level_converted = depth_convert<T, U>(gaussian_level_pyramid[i], DataType::S16, ConvertPolicy::WRAP, 0);
-
-        const SimpleTensor<U> level_sub = reference::arithmetic_operation<U>(reference::ArithmeticOperation::SUB, gaussian_level_converted, level_filtered_converted, dst.data_type(), ConvertPolicy::WRAP);
-        pyramid_dst.push_back(level_sub);
-    }
-
-    // Return the lowest resolution image and the pyramid
-    dst = depth_convert<T, U>(pyramid_conv[num_levels - 1], DataType::S16, ConvertPolicy::WRAP, 0);
-
-    return pyramid_dst;
-}
-
-template std::vector<SimpleTensor<int16_t>> laplacian_pyramid(const SimpleTensor<uint8_t> &src, SimpleTensor<int16_t> &dst, size_t num_levels, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/LaplacianPyramid.h b/tests/validation/reference/LaplacianPyramid.h
deleted file mode 100644
index 0596b81648..0000000000
--- a/tests/validation/reference/LaplacianPyramid.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_H
-#define ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-std::vector<SimpleTensor<U>> laplacian_pyramid(const SimpleTensor<T> &src, SimpleTensor<U> &dst, size_t num_levels, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_H */
diff --git a/tests/validation/reference/LaplacianReconstruct.cpp b/tests/validation/reference/LaplacianReconstruct.cpp
deleted file mode 100644
index 2a0fcc2a65..0000000000
--- a/tests/validation/reference/LaplacianReconstruct.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "LaplacianReconstruct.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/reference/ArithmeticOperations.h"
-#include "tests/validation/reference/DepthConvertLayer.h"
-#include "tests/validation/reference/Scale.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-SimpleTensor<U> laplacian_reconstruct(const std::vector<SimpleTensor<T>> &pyramid, const SimpleTensor<T> &low_res, BorderMode border_mode, T constant_border_value)
-{
-    std::vector<SimpleTensor<T>> tmp_pyramid(pyramid);
-
-    const size_t   last_level = pyramid.size() - 1;
-    const DataType data_type  = low_res.data_type();
-
-    // input + L(n-1)
-    tmp_pyramid[last_level] = reference::arithmetic_operation(reference::ArithmeticOperation::ADD, low_res, pyramid[last_level], data_type, ConvertPolicy::SATURATE);
-
-    // Scale levels n-1 to 1, and add levels n-2 to 0
-    for(size_t i = last_level; i-- > 0;)
-    {
-        const float scale_x = static_cast<float>(tmp_pyramid[i].shape().x()) / tmp_pyramid[i + 1].shape().x();
-        const float scale_y = static_cast<float>(tmp_pyramid[i].shape().y()) / tmp_pyramid[i + 1].shape().y();
-
-        tmp_pyramid[i] = reference::scale(tmp_pyramid[i + 1], scale_x, scale_y, InterpolationPolicy::NEAREST_NEIGHBOR,
-                                          border_mode, constant_border_value, SamplingPolicy::CENTER, false);
-
-        tmp_pyramid[i] = reference::arithmetic_operation(reference::ArithmeticOperation::ADD, tmp_pyramid[i], pyramid[i], data_type, ConvertPolicy::SATURATE);
-    }
-
-    return reference::depth_convert<T, U>(tmp_pyramid[0], DataType::U8, ConvertPolicy::SATURATE, 0);
-}
-
-template SimpleTensor<uint8_t> laplacian_reconstruct(const std::vector<SimpleTensor<int16_t>> &pyramid, const SimpleTensor<int16_t> &low_res, BorderMode border_mode, int16_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/LaplacianReconstruct.h b/tests/validation/reference/LaplacianReconstruct.h
deleted file mode 100644
index 8820c92983..0000000000
--- a/tests/validation/reference/LaplacianReconstruct.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_H
-#define ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-SimpleTensor<U> laplacian_reconstruct(const std::vector<SimpleTensor<T>> &pyramid, const SimpleTensor<T> &low_res, BorderMode border_mode, T constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_H */
diff --git a/tests/validation/reference/Magnitude.cpp b/tests/validation/reference/Magnitude.cpp
deleted file mode 100644
index 390aaa5d48..0000000000
--- a/tests/validation/reference/Magnitude.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Magnitude.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> magnitude(const SimpleTensor<T> &gx, const SimpleTensor<T> &gy, MagnitudeType magnitude_type)
-{
-    SimpleTensor<T> mag(gx.shape(), gx.data_type());
-
-    using intermediate_type = typename common_promoted_unsigned_type<T>::intermediate_type;
-
-    for(int i = 0; i < gx.num_elements(); ++i)
-    {
-        double val = 0.f;
-
-        if(magnitude_type == MagnitudeType::L1NORM)
-        {
-            val = static_cast<intermediate_type>(std::abs(gx[i])) + static_cast<intermediate_type>(std::abs(gy[i]));
-        }
-        else // MagnitudeType::L2NORM
-        {
-            // Note: kernel saturates to uint32_t instead of intermediate_type for S32 format
-            auto sum = static_cast<uint32_t>(gx[i] * gx[i]) + static_cast<uint32_t>(gy[i] * gy[i]);
-            val      = std::sqrt(sum) + 0.5f;
-        }
-
-        mag[i] = saturate_cast<T>(val);
-    }
-
-    return mag;
-}
-
-template SimpleTensor<int16_t> magnitude(const SimpleTensor<int16_t> &gx, const SimpleTensor<int16_t> &gy, MagnitudeType magnitude_type);
-template SimpleTensor<int32_t> magnitude(const SimpleTensor<int32_t> &gx, const SimpleTensor<int32_t> &gy, MagnitudeType magnitude_type);
-template SimpleTensor<half_float::half> magnitude(const SimpleTensor<half_float::half> &gx, const SimpleTensor<half_float::half> &gy, MagnitudeType magnitude_type);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Magnitude.h b/tests/validation/reference/Magnitude.h
deleted file mode 100644
index 81db27de20..0000000000
--- a/tests/validation/reference/Magnitude.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_MAGNITUDE_H
-#define ARM_COMPUTE_TEST_MAGNITUDE_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> magnitude(const SimpleTensor<T> &gx, const SimpleTensor<T> &gy, MagnitudeType magnitude_type);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MAGNITUDE_H */
diff --git a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
index 0a23fa19bb..a7c8a784d9 100644
--- a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,15 @@ SimpleTensor<T> mean_std_normalization_layer(const SimpleTensor<T> &src, float e
     return dst;
 }
 
+template <> // Full specialization for uint8_t tensors: computes in float and converts the result back.
+SimpleTensor<uint8_t> mean_std_normalization_layer(const SimpleTensor<uint8_t> &src, float epsilon)
+{
+    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src); // Float copy of src (presumably dequantized with src's own quantization info - confirm in the helper).
+    SimpleTensor<float>   dst_tmp = mean_std_normalization_layer<float>(src_tmp, epsilon); // Delegate to the float instantiation of the generic template.
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info()); // Convert back to uint8_t; the output reuses src's quantization info.
+    return dst;
+}
+
 template SimpleTensor<float> mean_std_normalization_layer(const SimpleTensor<float> &src, float epsilon);
 template SimpleTensor<half> mean_std_normalization_layer(const SimpleTensor<half> &src, float epsilon);
 } // namespace reference
diff --git a/tests/validation/reference/Median3x3.cpp b/tests/validation/reference/Median3x3.cpp
deleted file mode 100644
index 55f5f62292..0000000000
--- a/tests/validation/reference/Median3x3.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-
-#include "Median3x3.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-template <typename T>
-SimpleTensor<T> median3x3(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-    const int       size_tot_filter = filter_size * filter_size;
-    const uint32_t  num_elements    = src.num_elements();
-
-    for(uint32_t src_idx = 0; src_idx < num_elements; ++src_idx)
-    {
-        std::array<T, size_tot_filter> filter_elems = { { 0 } };
-        Coordinates id = index2coord(src.shape(), src_idx);
-        const int   x  = id.x();
-        const int   y  = id.y();
-
-        for(int j = y - static_cast<int>(border_size.top), index = 0; j <= y + static_cast<int>(border_size.bottom); ++j)
-        {
-            for(int i = x - static_cast<int>(border_size.left); i <= x + static_cast<int>(border_size.right); ++i, ++index)
-            {
-                id.set(0, i);
-                id.set(1, j);
-                filter_elems[index] = tensor_elem_at(src, id, border_mode, constant_border_value);
-            }
-        }
-        std::sort(filter_elems.begin(), filter_elems.end());
-        dst[src_idx] = filter_elems[size_tot_filter / 2];
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> median3x3(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Median3x3.h b/tests/validation/reference/Median3x3.h
deleted file mode 100644
index a10f428d0c..0000000000
--- a/tests/validation/reference/Median3x3.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_MEDIAN3X3_H
-#define ARM_COMPUTE_TEST_MEDIAN3X3_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> median3x3(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MEDIAN3X3_H */
diff --git a/tests/validation/reference/NonLinearFilter.cpp b/tests/validation/reference/NonLinearFilter.cpp
deleted file mode 100644
index ada8286927..0000000000
--- a/tests/validation/reference/NonLinearFilter.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "NonLinearFilter.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> non_linear_filter(const SimpleTensor<T> &src, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode,
-                                  uint8_t constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
-    ARM_COMPUTE_ERROR_ON(pattern == MatrixPattern::OTHER && mask == nullptr);
-    ARM_COMPUTE_UNUSED(pattern);
-
-    using intermediate_type = typename common_promoted_signed_type<T>::intermediate_type;
-
-    const int                      sq_mask_size   = mask_size * mask_size;
-    const int                      half_mask_size = mask_size / 2;
-    std::vector<intermediate_type> vals(sq_mask_size);
-    intermediate_type              current_value = 0;
-
-    const ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(half_mask_size));
-    const uint32_t    num_elements = src.num_elements();
-
-    for(uint32_t element_idx = 0, count = 0, index = 0; element_idx < num_elements; ++element_idx, count = 0, index = 0)
-    {
-        Coordinates id = index2coord(src.shape(), element_idx);
-        if(is_in_valid_region(valid_region, id))
-        {
-            int idx = id.x();
-            int idy = id.y();
-            for(int y = idy - half_mask_size; y <= idy + half_mask_size; ++y)
-            {
-                for(int x = idx - half_mask_size; x <= idx + half_mask_size; ++x, ++index)
-                {
-                    id.set(0, x);
-                    id.set(1, y);
-                    current_value = tensor_elem_at(src, id, border_mode, constant_border_value);
-
-                    if(mask[index] == 255)
-                    {
-                        vals[count] = static_cast<intermediate_type>(current_value);
-                        ++count;
-                    }
-                }
-            }
-            std::sort(vals.begin(), vals.begin() + count);
-
-            ARM_COMPUTE_ERROR_ON(count == 0);
-
-            switch(function)
-            {
-                case NonLinearFilterFunction::MIN:
-                    dst[element_idx] = saturate_cast<T>(vals[0]);
-                    break;
-                case NonLinearFilterFunction::MAX:
-                    dst[element_idx] = saturate_cast<T>(vals[count - 1]);
-                    break;
-                case NonLinearFilterFunction::MEDIAN:
-                    dst[element_idx] = saturate_cast<T>(vals[count / 2]);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported NonLinearFilter function.");
-            }
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> non_linear_filter(const SimpleTensor<uint8_t> &src, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                                 BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/NonLinearFilter.h b/tests/validation/reference/NonLinearFilter.h
deleted file mode 100644
index ecf6563a15..0000000000
--- a/tests/validation/reference/NonLinearFilter.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_BITWISE_NOT_H
-#define ARM_COMPUTE_TEST_BITWISE_NOT_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> non_linear_filter(const SimpleTensor<T> &src, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode,
-                                  uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_BITWISE_NOT_H */
diff --git a/tests/validation/reference/OpticalFlow.cpp b/tests/validation/reference/OpticalFlow.cpp
deleted file mode 100644
index 0a04214045..0000000000
--- a/tests/validation/reference/OpticalFlow.cpp
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "OpticalFlow.h"
-
-#include "GaussianPyramidHalf.h"
-#include "Scharr.h"
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-using KeyPointArray         = std::vector<KeyPoint>;
-using InternalKeyPointArray = std::vector<InternalKeyPoint>;
-
-// Constants used for Lucas-Kanade Algorithm
-constexpr int   W_BITS                = 14;
-constexpr float D0                    = 1 << W_BITS;
-constexpr float DETERMINANT_THRESHOLD = 1.0e-07f;
-constexpr float EIGENVALUE_THRESHOLD  = 1.0e-04f;
-constexpr float FLT_SCALE             = 1.0f / (1 << 20);
-
-// Creates an InternalKeyPointArray for tracking non-integral pixel coordinates
-InternalKeyPointArray create_internal_keypoints(const KeyPointArray &keypoints)
-{
-    InternalKeyPointArray internal_keypoints;
-
-    for(auto keypoint : keypoints)
-    {
-        InternalKeyPoint internal_keypoint;
-
-        internal_keypoint.x               = static_cast<float>(keypoint.x);
-        internal_keypoint.y               = static_cast<float>(keypoint.y);
-        internal_keypoint.tracking_status = static_cast<bool>(keypoint.tracking_status);
-
-        internal_keypoints.push_back(internal_keypoint);
-    }
-
-    return internal_keypoints;
-}
-
-// Scale tracked points based on Pyramid level
-void scale_tracked_points(size_t level, size_t num_levels, bool use_initial_estimate,
-                          InternalKeyPointArray &old_points_internal, InternalKeyPointArray &new_points_internal,
-                          const KeyPointArray &old_points, const KeyPointArray &new_points_estimates)
-{
-    if(level == num_levels - 1) // lowest resolution
-    {
-        const float scale = std::pow(SCALE_PYRAMID_HALF, level);
-
-        for(size_t i = 0; i < old_points.size(); ++i)
-        {
-            old_points_internal.at(i).x               = old_points.at(i).x * scale;
-            old_points_internal.at(i).y               = old_points.at(i).y * scale;
-            old_points_internal.at(i).tracking_status = true;
-
-            InternalKeyPoint keypoint_to_track;
-
-            if(use_initial_estimate)
-            {
-                keypoint_to_track.x               = new_points_estimates.at(i).x * scale;
-                keypoint_to_track.y               = new_points_estimates.at(i).y * scale;
-                keypoint_to_track.tracking_status = (new_points_estimates.at(i).tracking_status == 1);
-            }
-            else
-            {
-                keypoint_to_track.x               = old_points_internal.at(i).x;
-                keypoint_to_track.y               = old_points_internal.at(i).y;
-                keypoint_to_track.tracking_status = true;
-            }
-
-            new_points_internal.at(i) = keypoint_to_track;
-        }
-    }
-    else
-    {
-        for(size_t i = 0; i < old_points.size(); ++i)
-        {
-            old_points_internal.at(i).x /= SCALE_PYRAMID_HALF;
-            old_points_internal.at(i).y /= SCALE_PYRAMID_HALF;
-            new_points_internal.at(i).x /= SCALE_PYRAMID_HALF;
-            new_points_internal.at(i).y /= SCALE_PYRAMID_HALF;
-        }
-    }
-}
-
-bool is_invalid_keypoint(const InternalKeyPoint &keypoint, const ValidRegion &valid_region, size_t window_dimension)
-{
-    const int half_window = window_dimension / 2;
-    const int x           = std::floor(keypoint.x);
-    const int y           = std::floor(keypoint.y);
-
-    return (x - half_window < valid_region.start(0)) || (x + half_window >= valid_region.end(0) - 1) || (y - half_window < valid_region.start(1)) || (y + half_window >= valid_region.end(1) - 1);
-}
-
-template <typename T>
-constexpr int INT_ROUND(T x, int n)
-{
-    return (x + (1 << (n - 1))) >> n;
-}
-
-// Return the bilinear value at a specified coordinate with different border modes
-template <typename T>
-int bilinear_interpolate(const SimpleTensor<T> &in, Coordinates id, float wx, float wy, BorderMode border_mode, T constant_border_value, int scale)
-{
-    const int level = id.x();
-    const int idy   = id.y();
-
-    const float dx   = wx;
-    const float dy   = wy;
-    const float dx_1 = 1.0f - dx;
-    const float dy_1 = 1.0f - dy;
-
-    const T border_value = constant_border_value;
-
-    id.set(0, level);
-    id.set(1, idy);
-    const T tl = tensor_elem_at(in, id, border_mode, border_value);
-    id.set(0, level + 1);
-    id.set(1, idy);
-    const T tr = tensor_elem_at(in, id, border_mode, border_value);
-    id.set(0, level);
-    id.set(1, idy + 1);
-    const T bl = tensor_elem_at(in, id, border_mode, border_value);
-    id.set(0, level + 1);
-    id.set(1, idy + 1);
-    const T br = tensor_elem_at(in, id, border_mode, border_value);
-
-    // weights
-    const int w00 = roundf(dx_1 * dy_1 * D0);
-    const int w01 = roundf(dx * dy_1 * D0);
-    const int w10 = roundf(dx_1 * dy * D0);
-    const int w11 = D0 - w00 - w01 - w10;
-
-    return static_cast<int>(INT_ROUND(tl * w00 + tr * w01 + bl * w10 + br * w11, scale));
-}
-
-template <typename T>
-std::vector<int> compute_derivative(const SimpleTensor<T> &input, const InternalKeyPoint &keypoint,
-                                    BorderMode border_mode, uint8_t constant_border_value, size_t window_dimension, int scale)
-{
-    std::vector<int> bilinear_values;
-
-    const int half_window = window_dimension / 2;
-
-    float keypoint_int_x = 0;
-    float keypoint_int_y = 0;
-
-    const float wx = std::modf(keypoint.x, &keypoint_int_x);
-    const float wy = std::modf(keypoint.y, &keypoint_int_y);
-
-    Coordinates tl_window(static_cast<int>(keypoint_int_x) - half_window, static_cast<int>(keypoint_int_y) - half_window);
-    Coordinates br_window(static_cast<int>(keypoint_int_x) + half_window, static_cast<int>(keypoint_int_y) + half_window);
-
-    for(int y = tl_window.y(); y <= br_window.y(); ++y)
-    {
-        for(int x = tl_window.x(); x <= br_window.x(); ++x)
-        {
-            bilinear_values.push_back(bilinear_interpolate(input, Coordinates(x, y), wx, wy, border_mode, static_cast<T>(constant_border_value), scale));
-        }
-    }
-
-    return bilinear_values;
-}
-
-std::tuple<float, float, float> compute_spatial_gradient_matrix(const std::vector<int> &bilinear_ix, const std::vector<int> &bilinear_iy)
-{
-    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_iy.size());
-
-    int iA11 = 0;
-    int iA12 = 0;
-    int iA22 = 0;
-
-    for(size_t i = 0; i < bilinear_ix.size(); ++i)
-    {
-        int ixval = bilinear_ix[i];
-        int iyval = bilinear_iy[i];
-
-        iA11 += ixval * ixval;
-        iA12 += ixval * iyval;
-        iA22 += iyval * iyval;
-    }
-
-    return std::make_tuple(iA11 * FLT_SCALE, iA12 * FLT_SCALE, iA22 * FLT_SCALE);
-}
-
-std::tuple<double, double> compute_temporal_gradient_vector(const std::vector<int> &bilinear_it_old,
-                                                            const std::vector<int> &bilinear_it_new,
-                                                            const std::vector<int> &bilinear_ix,
-                                                            const std::vector<int> &bilinear_iy)
-{
-    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_iy.size());
-    ARM_COMPUTE_ERROR_ON(bilinear_it_old.size() != bilinear_it_new.size());
-
-    int ib1 = 0;
-    int ib2 = 0;
-
-    for(size_t i = 0; i < bilinear_ix.size(); ++i)
-    {
-        int ixval = bilinear_ix[i];
-        int iyval = bilinear_iy[i];
-        int ival  = bilinear_it_old[i];
-        int jval  = bilinear_it_new[i];
-
-        const int diff = jval - ival;
-
-        ib1 += diff * ixval;
-        ib2 += diff * iyval;
-    }
-
-    const double b1 = ib1 * FLT_SCALE;
-    const double b2 = ib2 * FLT_SCALE;
-
-    return std::make_tuple(b1, b2);
-}
-} // namespace
-
-template <typename T>
-std::vector<KeyPoint> optical_flow(const SimpleTensor<T> &old_input, const SimpleTensor<T> &new_input,
-                                   const OpticalFlowParameters &params, size_t num_levels,
-                                   const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
-                                   BorderMode border_mode, uint8_t constant_border_value)
-{
-    const int    filter_size      = 3;    // scharr filter size
-    const size_t max_iterations   = 1000; // fixed by kernel
-    const size_t window_dimension = params.window_dimension;
-    const size_t num_iterations   = (params.termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : params.num_iterations;
-
-    KeyPointArray new_points(old_points.size());
-
-    InternalKeyPointArray old_points_internal = create_internal_keypoints(old_points);
-    InternalKeyPointArray new_points_internal = create_internal_keypoints(new_points_estimates);
-
-    SimpleTensor<int16_t> scharr_gx;
-    SimpleTensor<int16_t> scharr_gy;
-
-    // Create pyramids
-    std::vector<SimpleTensor<T>> old_pyramid = gaussian_pyramid_half(old_input, border_mode, constant_border_value, num_levels);
-    std::vector<SimpleTensor<T>> new_pyramid = gaussian_pyramid_half(new_input, border_mode, constant_border_value, num_levels);
-
-    // Iterate over each level of the pyramid
-    for(size_t idx = num_levels; idx > 0; --idx)
-    {
-        const size_t level = idx - 1;
-
-        // Calculate scharr gradients
-        std::tie(scharr_gx, scharr_gy) = scharr<int16_t, T>(old_pyramid[level], filter_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
-
-        scale_tracked_points(level, num_levels, params.use_initial_estimate, old_points_internal, new_points_internal, old_points, new_points_estimates);
-
-        // Calculate valid region based on image dimensions of current pyramid level
-        const ValidRegion valid_region = shape_to_valid_region(old_pyramid[level].shape(), (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-
-        for(size_t i = 0; i < old_points.size(); ++i)
-        {
-            InternalKeyPoint &old_keypoint = old_points_internal.at(i);
-            InternalKeyPoint &new_keypoint = new_points_internal.at(i);
-
-            // Helper function for untracking keypoints when on the lowest pyramid level (high resolution)
-            const auto untrack_keypoint = [&](bool predicate)
-            {
-                if(predicate && (level == 0))
-                {
-                    new_keypoint.tracking_status = false;
-                    return true;
-                }
-                return predicate;
-            };
-
-            if(!old_keypoint.tracking_status)
-            {
-                continue;
-            }
-
-            // Check if tracked coordinate is outside image coordinate
-            if(untrack_keypoint(is_invalid_keypoint(old_keypoint, valid_region, window_dimension)))
-            {
-                continue;
-            }
-
-            // Compute spatial derivative
-            std::vector<int> bilinear_ix = compute_derivative(scharr_gx, old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS);
-            std::vector<int> bilinear_iy = compute_derivative(scharr_gy, old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS);
-
-            float A11 = 0.f;
-            float A12 = 0.f;
-            float A22 = 0.f;
-            std::tie(A11, A12, A22) = compute_spatial_gradient_matrix(bilinear_ix, bilinear_iy);
-
-            // Calculate criteria for lost tracking : Matrix A is invertible
-            // 1. The determinant of the matrix is less than DETERMINANT_THRESHOLD
-            // 2. The minimum eigenvalue of the matrix is less than EIGENVALUE_THRESHOLD
-            const float trace_A      = A11 + A22;
-            const float determinant  = A11 * A22 - A12 * A12;
-            const float discriminant = (trace_A * trace_A) - 4.0f * (determinant);
-            const float eigenvalue_A = (trace_A - std::sqrt(discriminant)) / 2.0f;
-
-            // Divide by window_dimension squared to reduce the floating point accummulation error
-            const float eigenvalue = eigenvalue_A / (window_dimension * window_dimension);
-
-            // Check if it is a good point to track
-            if(untrack_keypoint(eigenvalue < EIGENVALUE_THRESHOLD || determinant < DETERMINANT_THRESHOLD))
-            {
-                continue;
-            }
-
-            float prev_delta_x = 0.f;
-            float prev_delta_y = 0.f;
-
-            for(size_t j = 0; j < num_iterations; ++j)
-            {
-                // Check if tracked coordinate is outside image coordinate
-                if(untrack_keypoint(is_invalid_keypoint(new_keypoint, valid_region, window_dimension)))
-                {
-                    break;
-                }
-
-                // Compute temporal derivative
-                std::vector<int> bilinear_it_old = compute_derivative(old_pyramid[level], old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS - 5);
-                std::vector<int> bilinear_it_new = compute_derivative(new_pyramid[level], new_keypoint, border_mode, constant_border_value, window_dimension, W_BITS - 5);
-
-                double b1 = 0.f;
-                double b2 = 0.f;
-                std::tie(b1, b2) = compute_temporal_gradient_vector(bilinear_it_old, bilinear_it_new, bilinear_ix, bilinear_iy);
-
-                // Compute motion vector -> A^-1 * -b
-                const float delta_x = (A12 * b2 - A22 * b1) / determinant;
-                const float delta_y = (A12 * b1 - A11 * b2) / determinant;
-
-                // Update the new position
-                new_keypoint.x += delta_x;
-                new_keypoint.y += delta_y;
-
-                const float magnitude_squared = delta_x * delta_x + delta_y * delta_y;
-
-                // Check if termination criteria is EPSILON and if it is satisfied
-                if(magnitude_squared <= params.epsilon && (params.termination == Termination::TERM_CRITERIA_EPSILON || params.termination == Termination::TERM_CRITERIA_BOTH))
-                {
-                    break;
-                }
-
-                // Check convergence analyzing the previous delta
-                if(j > 0 && (std::fabs(delta_x + prev_delta_x) < 0.01f && std::fabs(delta_y + prev_delta_y) < 0.01f))
-                {
-                    new_keypoint.x -= delta_x * SCALE_PYRAMID_HALF;
-                    new_keypoint.y -= delta_y * SCALE_PYRAMID_HALF;
-
-                    break;
-                }
-
-                prev_delta_x = delta_x;
-                prev_delta_y = delta_y;
-            }
-        }
-    }
-
-    // Copy optical flow coordinates to output vector
-    for(size_t i = 0; i < old_points.size(); ++i)
-    {
-        const InternalKeyPoint &new_keypoint = new_points_internal.at(i);
-
-        new_points.at(i).x               = roundf(new_keypoint.x);
-        new_points.at(i).y               = roundf(new_keypoint.y);
-        new_points.at(i).tracking_status = new_keypoint.tracking_status ? 1 : 0;
-    }
-
-    return new_points;
-}
-
-template std::vector<KeyPoint> optical_flow(const SimpleTensor<uint8_t> &old_input, const SimpleTensor<uint8_t> &new_input,
-                                            const OpticalFlowParameters &params, size_t num_levels,
-                                            const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
-                                            BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/OpticalFlow.h b/tests/validation/reference/OpticalFlow.h
deleted file mode 100644
index 1bc367ab6a..0000000000
--- a/tests/validation/reference/OpticalFlow.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_OPTICAL_FLOW_H
-#define ARM_COMPUTE_TEST_OPTICAL_FLOW_H
-
-#include "tests/SimpleTensor.h"
-#include "tests/Types.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-std::vector<KeyPoint> optical_flow(const SimpleTensor<T> &old_input, const SimpleTensor<T> &new_input,
-                                   const OpticalFlowParameters &params, size_t num_levels,
-                                   const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
-                                   BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_OPTICAL_FLOW_H */
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index 6f122b1bf5..7aa3011d8f 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "Permute.h"
 
 #include "arm_compute/core/Types.h"
+
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -42,11 +43,11 @@ SimpleTensor<T> permute(const SimpleTensor<T> &src, PermutationVector perm)
     permute(dst_shape, perm);
 
     // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.quantization_info() };
+    SimpleTensor<T> dst{dst_shape, src.data_type(), src.num_channels(), src.quantization_info()};
 
     // Compute reference
     const uint32_t num_elements = src.num_elements();
-    for(uint32_t i = 0; i < num_elements; ++i)
+    for (uint32_t i = 0; i < num_elements; ++i)
     {
         const Coordinates src_coords = index2coord(src.shape(), i);
         Coordinates       dst_coords = src_coords;
@@ -58,13 +59,14 @@ SimpleTensor<T> permute(const SimpleTensor<T> &src, PermutationVector perm)
     return dst;
 }
 
-template SimpleTensor<int8_t> permute(const SimpleTensor<int8_t> &src, PermutationVector perm);
-template SimpleTensor<uint8_t> permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
-template SimpleTensor<int16_t> permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
+template SimpleTensor<int8_t>   permute(const SimpleTensor<int8_t> &src, PermutationVector perm);
+template SimpleTensor<uint8_t>  permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
+template SimpleTensor<int16_t>  permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
 template SimpleTensor<uint16_t> permute(const SimpleTensor<uint16_t> &src, PermutationVector perm);
 template SimpleTensor<uint32_t> permute(const SimpleTensor<uint32_t> &src, PermutationVector perm);
-template SimpleTensor<float> permute(const SimpleTensor<float> &src, PermutationVector perm);
-template SimpleTensor<half> permute(const SimpleTensor<half> &src, PermutationVector perm);
+template SimpleTensor<float>    permute(const SimpleTensor<float> &src, PermutationVector perm);
+template SimpleTensor<half>     permute(const SimpleTensor<half> &src, PermutationVector perm);
+template SimpleTensor<bfloat16> permute(const SimpleTensor<bfloat16> &src, PermutationVector perm);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Phase.cpp b/tests/validation/reference/Phase.cpp
deleted file mode 100644
index 228f73bb37..0000000000
--- a/tests/validation/reference/Phase.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Phase.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint8_t> phase(const SimpleTensor<T> &gx, const SimpleTensor<T> &gy, PhaseType phase_type)
-{
-    const float           PI = std::atan(1) * 4;
-    SimpleTensor<uint8_t> phase(gx.shape(), DataType::U8);
-
-    if(phase_type == PhaseType::UNSIGNED) // unsigned: map to [0-255)
-    {
-        for(int i = 0; i < gx.num_elements(); ++i)
-        {
-            float angle_deg = (std::atan2(float(gy[i]), float(gx[i])) / PI) * 180.0f;
-            phase[i]        = (angle_deg < 0.0f) ? 180.f + angle_deg : angle_deg;
-        }
-    }
-    else // signed: map to [0-180) degrees
-    {
-        for(int i = 0; i < gx.num_elements(); ++i)
-        {
-            float angle_pi = std::atan2(gy[i], gx[i]) / PI;
-            angle_pi       = (angle_pi < 0.0f) ? 2 + angle_pi : angle_pi;
-            phase[i]       = lround(angle_pi * 128) & 0xFFu;
-        }
-    }
-
-    return phase;
-}
-
-template SimpleTensor<uint8_t> phase(const SimpleTensor<int16_t> &gx, const SimpleTensor<int16_t> &gy, PhaseType phase_type);
-template SimpleTensor<uint8_t> phase(const SimpleTensor<int32_t> &gx, const SimpleTensor<int32_t> &gy, PhaseType phase_type);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Phase.h b/tests/validation/reference/Phase.h
deleted file mode 100644
index 436c280f0a..0000000000
--- a/tests/validation/reference/Phase.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_PHASE_H
-#define ARM_COMPUTE_TEST_PHASE_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<uint8_t> phase(const SimpleTensor<T> &gx, const SimpleTensor<T> &gy, PhaseType phase_type);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_PHASE_H */
diff --git a/tests/validation/reference/Pooling3dLayer.cpp b/tests/validation/reference/Pooling3dLayer.cpp
new file mode 100644
index 0000000000..2e8f3a0b92
--- /dev/null
+++ b/tests/validation/reference/Pooling3dLayer.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "Pooling3dLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+using namespace arm_compute::misc::shape_calculator;
+/** Reference 3D pooling (MAX, AVG or L2). If @p indices is non-null it is re-created with the pooled shape and receives, per output element, the source index of the window maximum. */
+template <typename T>
+SimpleTensor<T> pooling_3d_layer_internal(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, SimpleTensor<uint32_t> *indices)
+{
+    TensorShape     pooled_shape = compute_pool3d_shape(src.shape(), pool3d_info);
+    SimpleTensor<T> dst{ pooled_shape, src.data_type(), 1 };
+
+    if(indices != nullptr)
+    {
+        *indices = SimpleTensor<uint32_t> { pooled_shape, DataType::U32, 1 };
+    }
+
+    const int idx_channel = 0; // Dimension order used by every offset below: channel, width, height, depth, batch
+    const int idx_width   = 1;
+    const int idx_height  = 2;
+    const int idx_depth   = 3;
+    const int idx_batch   = 4;
+
+    const int pool_size_width  = pool3d_info.is_global_pooling ? src.shape()[idx_width] : pool3d_info.pool_size.width; // Global pooling: the window is the whole source extent
+    const int pool_size_height = pool3d_info.is_global_pooling ? src.shape()[idx_height] : pool3d_info.pool_size.height;
+    const int pool_size_depth  = pool3d_info.is_global_pooling ? src.shape()[idx_depth] : pool3d_info.pool_size.depth;
+
+    const int pool_stride_width  = static_cast<int>(pool3d_info.stride.width);
+    const int pool_stride_height = static_cast<int>(pool3d_info.stride.height);
+    const int pool_stride_depth  = static_cast<int>(pool3d_info.stride.depth);
+
+    const int pad_left  = static_cast<int>(pool3d_info.padding.left);
+    const int pad_top   = static_cast<int>(pool3d_info.padding.top);
+    const int pad_front = static_cast<int>(pool3d_info.padding.front);
+
+    const int pad_right  = static_cast<int>(pool3d_info.padding.right);
+    const int pad_bottom = static_cast<int>(pool3d_info.padding.bottom);
+    const int pad_back   = static_cast<int>(pool3d_info.padding.back);
+
+    const int num_channels = static_cast<int>(src.shape()[idx_channel]);
+    const int num_batches  = static_cast<int>(src.shape()[idx_batch]);
+
+    ARM_COMPUTE_ERROR_ON(num_channels != static_cast<int>(dst.shape()[idx_channel])); // The pooled shape must keep the channel and batch extents of the source
+    ARM_COMPUTE_ERROR_ON(num_batches != static_cast<int>(dst.shape()[idx_batch]));
+
+    const int w_src = static_cast<int>(src.shape()[idx_width]);
+    const int h_src = static_cast<int>(src.shape()[idx_height]);
+    const int d_src = static_cast<int>(src.shape()[idx_depth]);
+    const int w_dst = static_cast<int>(dst.shape()[idx_width]);
+    const int h_dst = static_cast<int>(dst.shape()[idx_height]);
+    const int d_dst = static_cast<int>(dst.shape()[idx_depth]);
+
+    const bool exclude_padding = pool3d_info.exclude_padding;
+
+    const int height_stride_src = num_channels * w_src; // Flat element strides, with channel as the fastest-varying dimension
+    const int depth_stride_src  = height_stride_src * h_src;
+    const int batch_stride_src  = depth_stride_src * d_src;
+    const int height_stride_dst = num_channels * w_dst;
+    const int depth_stride_dst  = height_stride_dst * h_dst;
+    const int batch_stride_dst  = depth_stride_dst * d_dst;
+
+    for(int b = 0; b < num_batches; ++b)
+    {
+        const int batch_offset_dst = b * batch_stride_dst;
+        const int batch_offset_src = b * batch_stride_src;
+        for(int c = 0; c < num_channels; ++c)
+        {
+            for(int d = 0; d < d_dst; ++d)
+            {
+                const int depth_offset_dst = d * depth_stride_dst;
+                for(int h = 0; h < h_dst; ++h)
+                {
+                    const int height_offset_dst = h * height_stride_dst;
+                    for(int w = 0; w < w_dst; ++w)
+                    {
+                        int wstart = w * pool_stride_width - pad_left; // Window origin in source coordinates; negative while inside the leading padding
+                        int hstart = h * pool_stride_height - pad_top;
+                        int dstart = d * pool_stride_depth - pad_front;
+                        int wend   = std::min(wstart + pool_size_width, w_src + pad_right);
+                        int hend   = std::min(hstart + pool_size_height, h_src + pad_bottom);
+                        int dend   = std::min(dstart + pool_size_depth, d_src + pad_back);
+
+                        // Window volume including padding. This may not be equal to pool_w * pool_h * pool_d
+                        // because of the DimensionRoundingType choice (CEIL): the end is clamped to src + pad above
+                        int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+
+                        // Clamp each [start, end) range to the valid source extent: [0, w_src), [0, h_src), [0, d_src)
+                        wstart = std::max(wstart, 0);
+                        hstart = std::max(hstart, 0);
+                        dstart = std::max(dstart, 0);
+                        wend   = std::min(wend, w_src);
+                        hend   = std::min(hend, h_src);
+                        dend   = std::min(dend, d_src);
+
+                        auto max_val = -std::numeric_limits<T>::infinity();
+                        int  max_index{ 0 };
+                        T    avg_val = static_cast<T>(0.f);
+                        T    l2_val  = static_cast<T>(0.f);
+
+                        if(exclude_padding)
+                        {
+                            pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); // Recomputed after clamping: counts only elements inside the source
+                        }
+
+                        for(int z = dstart; z < dend; ++z)
+                        {
+                            const int depth_offset_src = z * depth_stride_src;
+                            for(int y = hstart; y < hend; ++y)
+                            {
+                                const int height_offset_src = y * height_stride_src;
+                                for(int x = wstart; x < wend; ++x)
+                                {
+                                    const auto val = static_cast<T>(
+                                                         src[batch_offset_src + depth_offset_src + height_offset_src + x * num_channels + c]);
+                                    if(val > max_val)
+                                    {
+                                        max_val   = val;
+                                        max_index = coord2index(src.shape(), Coordinates(c, x, y, z, 0)); // Batch coordinate is fixed to 0, so the index is relative to the current batch
+                                    }
+
+                                    avg_val += val; // Max, sum and sum of squares are all accumulated; pool_type picks the one stored below
+                                    l2_val += val * val;
+                                }
+                            }
+                        }
+
+                        avg_val /= pool_size;
+                        l2_val = static_cast<T>(std::sqrt(l2_val / pool_size)); // Square root of the mean of squares
+
+                        int dst_index = batch_offset_dst + depth_offset_dst + height_offset_dst + w * num_channels + c;
+                        switch(pool3d_info.pool_type)
+                        {
+                            case PoolingType::MAX:
+                                dst[dst_index] = static_cast<T>(max_val);
+                                break;
+                            case PoolingType::AVG:
+                                dst[dst_index] = static_cast<T>(avg_val);
+                                break;
+                            case PoolingType::L2:
+                                dst[dst_index] = static_cast<T>(l2_val);
+                                break;
+                            default:
+                                ARM_COMPUTE_ERROR("Pooling Type should be either MAX, AVG or L2");
+                        }
+
+                        if(indices != nullptr)
+                        {
+                            (*indices)[dst_index] = max_index; // Written for every pool type, not only MAX
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> pooling_3d_layer(const SimpleTensor<float> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices);
+template SimpleTensor<half> pooling_3d_layer(const SimpleTensor<half> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices);
+/** Generic entry point (explicitly instantiated above for float and half): forwards to pooling_3d_layer_internal; @p output_qinfo is ignored. */
+template <typename T>
+SimpleTensor<T> pooling_3d_layer(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
+{
+    ARM_COMPUTE_UNUSED(output_qinfo);
+    return pooling_3d_layer_internal<T>(src, pool3d_info, indices);
+}
+/** int8_t specialization: converts @p src to float with convert_from_asymmetric, pools in float, then converts the result back to int8_t using @p output_qinfo. */
+template <>
+SimpleTensor<int8_t> pooling_3d_layer<int8_t>(const SimpleTensor<int8_t> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
+{
+    SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float> dst_tmp = pooling_3d_layer_internal<float>(src_tmp, pool3d_info, indices);
+    return convert_to_asymmetric<int8_t>(dst_tmp, output_qinfo);
+}
+/** uint8_t specialization: converts @p src to float with convert_from_asymmetric, pools in float, then converts the result back to uint8_t using @p output_qinfo. */
+template <>
+SimpleTensor<uint8_t> pooling_3d_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
+{
+    SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float> dst_tmp = pooling_3d_layer_internal<float>(src_tmp, pool3d_info, indices);
+    return convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
+}
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HarrisCornerDetector.h b/tests/validation/reference/Pooling3dLayer.h
index 2f464749f6..481a0d3024 100644
--- a/tests/validation/reference/HarrisCornerDetector.h
+++ b/tests/validation/reference/Pooling3dLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_HARRIS_CORNER_DETECTOR_H
-#define ARM_COMPUTE_TEST_HARRIS_CORNER_DETECTOR_H
+#ifndef ARM_COMPUTE_TEST_POOL3D_LAYER_H
+#define ARM_COMPUTE_TEST_POOL3D_LAYER_H
 
+#include "Utils.h"
 #include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
-
-#include <vector>
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -38,11 +38,13 @@ namespace validation
 namespace reference
 {
 template <typename T>
-std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<T> &src,
-                                             float threshold, float min_dist, float sensitivity, int gradient_size, int block_size,
-                                             BorderMode border_mode, T constant_border_value = 0);
+SimpleTensor<T> pooling_3d_layer_internal(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, SimpleTensor<uint32_t> *indices = nullptr);
+
+template <typename T>
+SimpleTensor<T> pooling_3d_layer(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo = QuantizationInfo(),
+                                 SimpleTensor<uint32_t> *indices = nullptr);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HARRIS_CORNER_DETECTOR_H */
+#endif /* ARM_COMPUTE_TEST_POOL3D_LAYER_H */
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 5f4edfe49c..bf7bd0c1df 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,6 @@ using namespace arm_compute::misc::shape_calculator;
 template <typename T, typename ACC_T, typename std::enable_if<is_floating_point<T>::value, int>::type>
 SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices, DataLayout data_layout)
 {
-    ARM_COMPUTE_ERROR_ON(info.is_global_pooling && (src.shape().x() != src.shape().y()));
     // Create reference
     SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), src.data_type(), 1 };
     auto            pooled_shape = compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info);
@@ -84,20 +83,28 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
                     {
                         int wstart   = w * pool_stride_x - pad_left;
                         int hstart   = h * pool_stride_y - pad_top;
+
+                        // Used to calculate kernel indices
+                        int  kh_start = std::max(0, -hstart);
+                        int  kw_start = std::max(0, -wstart);
+                        int  max_ker_index{ 0 };
+
                         int wend     = std::min(wstart + pool_size_x, w_src);
                         int hend     = std::min(hstart + pool_size_y, h_src);
                         wstart       = std::max(wstart, 0);
                         hstart       = std::max(hstart, 0);
-                        auto max_val = std::numeric_limits<ACC_T>::lowest();
+                        auto max_val = info.use_inf_as_limit ? -std::numeric_limits<ACC_T>::infinity() : std::numeric_limits<ACC_T>::lowest();
                         int  max_index{ 0 };
-                        for(int y = hstart; y < hend; ++y)
+
+                        for(int y = hstart, kh = kh_start; y < hend; ++y, ++kh)
                         {
-                            for(int x = wstart; x < wend; ++x)
+                            for(int x = wstart, kw = kw_start; x < wend; ++x, ++kw)
                             {
                                 const auto val = static_cast<ACC_T>(src[b * z_src * h_src * w_src + r * h_src * w_src + y * w_src + x]);
                                 if(val > max_val)
                                 {
-                                    max_val = val;
+                                    max_val   = val;
+                                    max_ker_index = pool_size_x * (kh) + (kw);
                                     if(data_layout == DataLayout::NCHW)
                                     {
                                         max_index = coord2index(src.shape(), Coordinates(x, y, r, 0));
@@ -113,7 +120,7 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
                         dst[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
                         if(indices)
                         {
-                            (*indices)[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = max_index;
+                            (*indices)[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = (info.use_kernel_indices) ? max_ker_index : max_index;
                         }
                     }
                 }
diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp
index 27665375c3..ad7ba7ac43 100644
--- a/tests/validation/reference/QuantizationLayer.cpp
+++ b/tests/validation/reference/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ROIPoolingLayer.cpp b/tests/validation/reference/ROIPoolingLayer.cpp
new file mode 100644
index 0000000000..8dc3014763
--- /dev/null
+++ b/tests/validation/reference/ROIPoolingLayer.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ROIPoolingLayer.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/validation/Helpers.h"
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <> // Float reference: max-pools every ROI of src into a pooled_width x pooled_height grid per feature map
+SimpleTensor<float> roi_pool_layer(const SimpleTensor<float> &src, const SimpleTensor<uint16_t> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo)
+{
+    ARM_COMPUTE_UNUSED(output_qinfo);
+
+    const size_t num_rois         = rois.shape()[1];
+    const size_t values_per_roi   = rois.shape()[0];
+    DataType     output_data_type = src.data_type();
+
+    TensorShape         input_shape = src.shape();
+    TensorShape         output_shape(pool_info.pooled_width(), pool_info.pooled_height(), src.shape()[2], num_rois);
+    SimpleTensor<float> output(output_shape, output_data_type);
+
+    const int   pooled_w      = pool_info.pooled_width();
+    const int   pooled_h      = pool_info.pooled_height();
+    const float spatial_scale = pool_info.spatial_scale();
+
+    // get sizes of x and y dimensions in src tensor
+    const int width  = src.shape()[0];
+    const int height = src.shape()[1];
+
+    // Element strides of the fourth dimension: selected by roi_batch in src and by the ROI index in output
+    const size_t input_stride_w  = input_shape[0] * input_shape[1] * input_shape[2];
+    const size_t output_stride_w = output_shape[0] * output_shape[1] * output_shape[2];
+
+    const auto *rois_ptr = reinterpret_cast<const uint16_t *>(rois.data());
+
+    // Iterate through regions of interest; each ROI is read as { batch index, x1, y1, x2, y2 }
+    for(size_t pw = 0; pw < num_rois; ++pw)
+    {
+        const unsigned int roi_batch = rois_ptr[values_per_roi * pw];
+        const auto         x1        = rois_ptr[values_per_roi * pw + 1];
+        const auto         y1        = rois_ptr[values_per_roi * pw + 2];
+        const auto         x2        = rois_ptr[values_per_roi * pw + 3];
+        const auto         y2        = rois_ptr[values_per_roi * pw + 4];
+
+        // Iterate over feature maps (Z axis)
+        for(size_t fm = 0; fm < input_shape[2]; ++fm)
+        {
+            // Iterate through pooled output rows (Y axis)
+            for(size_t py = 0; py < pool_info.pooled_height(); ++py)
+            {
+                // Scale ROI by spatial_scale; the scaled width and height are at least 1
+                const int roi_anchor_x = support::cpp11::round(x1 * spatial_scale);
+                const int roi_anchor_y = support::cpp11::round(y1 * spatial_scale);
+                const int roi_width    = std::max(support::cpp11::round((x2 - x1) * spatial_scale), 1.f);
+                const int roi_height   = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
+
+                // Iterate through pooled output columns (X axis)
+                for(size_t px = 0; px < pool_info.pooled_width(); ++px)
+                {
+                    auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
+                    auto region_end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+                    auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+                    auto region_end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+
+                    region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width); // Shift by the ROI anchor, then clamp to the source extent
+                    region_end_x   = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
+                    region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
+                    region_end_y   = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
+
+                    // An empty pooling region produces 0; otherwise the output is the maximum over the region
+                    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+                    {
+                        /* Assign element in tensor 'output' at coordinates px, py, fm, roi_indx, to 0 */
+                        auto out_ptr = output.data() + px + py * output_shape[0] + fm * output_shape[0] * output_shape[1] + pw * output_stride_w;
+                        *out_ptr     = 0;
+                    }
+                    else
+                    {
+                        float curr_max = -std::numeric_limits<float>::max();
+                        for(int j = region_start_y; j < region_end_y; ++j)
+                        {
+                            for(int i = region_start_x; i < region_end_x; ++i)
+                            {
+                                /* Retrieve element from input tensor at coordinates(i, j, fm, roi_batch) */
+                                float in_element = *(src.data() + i + j * input_shape[0] + fm * input_shape[0] * input_shape[1] + roi_batch * input_stride_w);
+                                curr_max         = std::max(in_element, curr_max);
+                            }
+                        }
+
+                        /* Assign element in tensor 'output' at coordinates px, py, fm, roi_indx, to curr_max */
+                        auto out_ptr = output.data() + px + py * output_shape[0] + fm * output_shape[0] * output_shape[1] + pw * output_stride_w;
+                        *out_ptr     = curr_max;
+                    }
+                }
+            }
+        }
+    }
+
+    return output;
+}
+
+/*
+    Quantized 8-bit specialisation: converts src to float, runs the float roi_pool_layer, then converts the result back to uint8_t using output_qinfo
+*/
+template <>
+SimpleTensor<uint8_t> roi_pool_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint16_t> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo)
+{
+    const SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float>       dst_tmp = roi_pool_layer<float>(src_tmp, rois, pool_info, output_qinfo);
+    SimpleTensor<uint8_t>     dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
+    return dst;
+}
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/tests/validation/reference/FastCorners.h b/tests/validation/reference/ROIPoolingLayer.h
index 2c4506de5c..ddbaee2d5e 100644
--- a/tests/validation/reference/FastCorners.h
+++ b/tests/validation/reference/ROIPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_FAST_CORNERS_H
-#define ARM_COMPUTE_TEST_FAST_CORNERS_H
+#ifndef ARM_COMPUTE_TEST_ROIPOOLLAYER_H
+#define ARM_COMPUTE_TEST_ROIPOOLLAYER_H
 
 #include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -36,9 +37,10 @@ namespace validation
 namespace reference
 {
 template <typename T>
-std::vector<KeyPoint> fast_corners(const SimpleTensor<T> &src, float input_thresh, bool suppress_nonmax, BorderMode border_mode, T constant_border_value = 0);
+SimpleTensor<T> roi_pool_layer(const SimpleTensor<T> &src, const SimpleTensor<uint16_t> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_FAST_CORNERS_H */
+
+#endif /* ARM_COMPUTE_TEST_ROIPOOLLAYER_H */
+\ No newline at end of file
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index ffb79f86c5..c189bc2d47 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "ReductionOperation.h"
-
 #include "tests/validation/Helpers.h"
 
 #include <algorithm>
@@ -39,7 +38,7 @@ namespace reference
 namespace
 {
 template <typename T, typename OT>
-OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride)
+OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride, RoundingPolicy policy)
 {
     using type = typename std::remove_cv<OT>::type;
     T res;
@@ -99,7 +98,14 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
         }
         if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0)
         {
-            int_res /= reduce_elements;
+            // Only use rounding in aarch64 to be consistent with kernel
+#ifdef __aarch64__
+            // Divide in float format, then rounded to nearest and implicitly cast back to int
+            int_res = round(static_cast<float>(int_res) / static_cast<float>(reduce_elements), policy);
+#else  // defined(__aarch64__)
+            ARM_COMPUTE_UNUSED(policy);
+            int_res /= reduce_elements; // Legacy compatibility
+#endif // __aarch64__
         }
         res = static_cast<type>(int_res);
     }
@@ -175,12 +181,12 @@ OT reduce_operation_arg_min_max(const T *ptr, int reduce_elements, ReductionOper
 } // namespace
 
 template <typename T, typename OT>
-SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
+SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                             DataType output_type, RoundingPolicy policy)
 {
     // Create reference
-    const bool         is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
-    DataType           output_data_type = is_arg_min_max ? DataType::S32 : src.data_type();
-    SimpleTensor<OT>   dst{ dst_shape, output_data_type, 1, src.quantization_info() };
+    const bool         is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+    SimpleTensor<OT>   dst{ dst_shape, output_type, 1, src.quantization_info() };
     const unsigned int src_width    = src.shape().x();
     const unsigned int src_height   = src.shape().y();
     const unsigned int src_depth    = src.shape().z();
@@ -197,7 +203,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                 const T *src_row_ptr = src.data() + du * reduce_elems;
                 dst[du]              = is_arg_min_max ?
                                        reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, 1) :
-                                       reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1);
+                                       reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1, policy);
             }
         }
         break;
@@ -213,7 +219,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                     const T *src_row_ptr = src.data() + in_offset;
                     dst[out_offset]       = is_arg_min_max ?
                                             reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width) :
-                                            reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width);
+                                            reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width, policy);
                 }
             }
         }
@@ -232,7 +238,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                         const T *src_row_ptr = src.data() + in_offset;
                         dst[out_offset]       = is_arg_min_max ?
                                                 reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height) :
-                                                reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height);
+                                                reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height, policy);
                     }
                 }
             }
@@ -254,7 +260,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                             const T *src_row_ptr = src.data() + in_offset;
                             dst[out_offset]       = is_arg_min_max ?
                                                     reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth) :
-                                                    reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
+                                                    reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth, policy);
                         }
                     }
                 }
@@ -269,74 +275,89 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
 }
 
 template <typename T, typename OT>
-SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                     DataType output_type, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     ARM_COMPUTE_UNUSED(quantization_info_output);
-    return compute_reduction_operation<T, OT>(src, dst_shape, axis, op);
+    return compute_reduction_operation<T, OT>(src, dst_shape, axis, op, output_type, policy);
 }
 
 template <>
-SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                          DataType output_type, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     if(src.data_type() == DataType::QASYMM8)
     {
         // If the operation is MEAN_SUM, we can directly use the uint8 implementation without taking into account scale and offset
         if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output)
         {
-            return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op);
+            return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, output_type, policy);
         }
         else
         {
             SimpleTensor<float> src_f = convert_from_asymmetric(src);
-            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op);
+            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op, output_type);
             return convert_to_asymmetric<uint8_t>(dst_f, quantization_info_output);
         }
     }
     else
     {
-        return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op);
+        return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, output_type, policy);
     }
 }
 
 template <>
-SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis,
+                                         ReductionOperation op, DataType output_type, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     if(src.data_type() == DataType::QASYMM8_SIGNED)
     {
         // If the operation is MEAN_SUM, we can directly use the int8 implementation without taking into account scale and offset
         if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output)
         {
-            return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op);
+            return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, output_type, policy);
         }
         else
         {
             SimpleTensor<float> src_f = convert_from_asymmetric(src);
-            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op);
+            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op, output_type);
             return convert_to_asymmetric<int8_t>(dst_f, quantization_info_output);
         }
     }
     else
     {
-        return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op);
+        return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, output_type, policy);
     }
 }
 
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                 QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                 DataType output_type = DataType::S32, QuantizationInfo quantization_info_output = QuantizationInfo(),
+                                                 RoundingPolicy policy = RoundingPolicy::TO_ZERO);
+
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                DataType         output_type              = DataType::S32,
+                                                QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
+
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 
+template SimpleTensor<int64_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                                   DataType output_type = DataType::S32, QuantizationInfo quantization_info_output = QuantizationInfo(),
+                                                   RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index 9c9e721b29..fb2e7a7093 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_REDUCTION_OPERATION_H
 #define ARM_COMPUTE_TEST_REDUCTION_OPERATION_H
 
+#include "arm_compute/core/Rounding.h"
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
 
@@ -36,8 +37,8 @@ namespace validation
 namespace reference
 {
 template <typename T, typename OT>
-SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                     QuantizationInfo quantization_info_output = QuantizationInfo());
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, DataType output_type = DataType::S32,
+                                     QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Remap.cpp b/tests/validation/reference/Remap.cpp
deleted file mode 100644
index 5c4d3c186b..0000000000
--- a/tests/validation/reference/Remap.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Remap.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-
-#include <algorithm>
-#include <array>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> remap(const SimpleTensor<T> &in, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<T> &valid_mask, InterpolationPolicy policy, BorderMode border_mode,
-                      T constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(border_mode == BorderMode::REPLICATE, "BorderMode not supported");
-    SimpleTensor<T> out(in.shape(), in.data_type());
-    ARM_COMPUTE_ERROR_ON(out.num_elements() != map_x.num_elements());
-    const int      width        = in.shape().x();
-    const int      height       = in.shape().y();
-    const uint32_t num_elements = out.num_elements();
-    for(uint32_t idx = 0; idx < num_elements; idx++)
-    {
-        const Coordinates id_out = index2coord(out.shape(), idx);
-        valid_mask[idx]          = 1;
-        Coordinates src_idx      = id_out; // need to setup all coordinates and not just xy
-        src_idx.set(0, static_cast<int>(std::floor(map_x[idx])));
-        src_idx.set(1, static_cast<int>(std::floor(map_y[idx])));
-        if((0 <= map_y[idx]) && (map_y[idx] < height) && (0 <= map_x[idx]) && (map_x[idx] < width))
-        {
-            switch(policy)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                {
-                    out[idx] = tensor_elem_at(in, src_idx, border_mode, constant_border_value);
-                    break;
-                }
-                case InterpolationPolicy::BILINEAR:
-                {
-                    (valid_bilinear_policy(map_x[idx], map_y[idx], width, height, border_mode)) ?
-                    out[idx]        = bilinear_policy(in, src_idx, map_x[idx], map_y[idx], border_mode, constant_border_value) :
-                                      valid_mask[idx] = 0;
-                    break;
-                }
-                case InterpolationPolicy::AREA:
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-                    break;
-            }
-        }
-        else
-        {
-            if(border_mode == BorderMode::UNDEFINED)
-            {
-                valid_mask[idx] = 0;
-            }
-            else
-            {
-                switch(policy)
-                {
-                    case InterpolationPolicy::NEAREST_NEIGHBOR:
-                        out[idx] = constant_border_value;
-                        break;
-                    case InterpolationPolicy::BILINEAR:
-                        out[idx] = bilinear_policy(in, src_idx, map_x[idx], map_y[idx], border_mode, constant_border_value);
-                        break;
-                    case InterpolationPolicy::AREA:
-                    default:
-                        break;
-                }
-            }
-        }
-    }
-
-    return out;
-}
-
-template SimpleTensor<uint8_t> remap(const SimpleTensor<uint8_t> &src, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<uint8_t> &valid_mask, InterpolationPolicy policy,
-                                     BorderMode border_mode,
-                                     uint8_t    constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Remap.h b/tests/validation/reference/Remap.h
deleted file mode 100644
index 0726f75965..0000000000
--- a/tests/validation/reference/Remap.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_REMAP_H
-#define ARM_COMPUTE_TEST_REMAP_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> remap(const SimpleTensor<T> &in, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<T> &valid_mask, InterpolationPolicy policy, BorderMode border_mode,
-                      T constant_border_value = 0);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REMAP_H */
diff --git a/tests/validation/reference/Reorder.cpp b/tests/validation/reference/Reorder.cpp
new file mode 100644
index 0000000000..8abb372596
--- /dev/null
+++ b/tests/validation/reference/Reorder.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Reorder.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+
+/*
+ * Generic reference transform: copies the [y0, ymax) x [x0, xmax) window of
+ * "in" into "out" as consecutive tIntBy x BlockBy blocks.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row.  The same number of values
+ * is then read from each of the next <IntBy-1> rows.  Now return to the
+ * first input row and repeat.  "stride" is the element pitch used to index
+ * "in".  Work that is not a multiple of the block sizes in either dimension
+ * is padded with zeros, so every block written is exactly IntBy * BlockBy.
+ */
+template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, typename d_type, arm_gemm::VLType vlt>
+struct Transform_ref
+{
+    template <typename TOut, typename TIn>
+    static void Transform(TOut &out, const TIn in, const int stride,
+                          const int y0, const int ymax, const int x0, const int xmax)
+    {
+        // NOTE: This code is disabled to avoid the call to get_vector_length(), so templated transforms will not be
+        // correct for SVE.  This is not an issue as we have specializations for all SVE cases.
+        // For SVE cases we multiply the interleave factor by the vector length.
+        // const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
+        const unsigned int IntBy     = tIntBy;
+        int                out_index = 0;
+
+        const int n_whole_y_blocks = (ymax - y0) / IntBy;
+        const int y_remainders     = (ymax - y0) % IntBy;
+        const int n_y_blocks       = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+        const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+        const int x_remainders     = (xmax - x0) % BlockBy;
+        const int n_x_blocks       = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+        // "Y" loop: advance down the rows of the source IntBy rows at a time.
+        // Set up fill_rows to show the number of rows to copy from, and blank_rows
+        // for the number of blank (zero) rows to add to complete the block.
+        for(int y_block = 0; y_block < n_y_blocks; y_block++)
+        {
+            const int fill_rows  = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+            const int blank_rows = IntBy - fill_rows;
+
+            const int y_base = y0 + (y_block * IntBy);
+
+            // So now advance along this block of rows, BlockBy columns at a time.
+            for(int x_block = 0; x_block < n_x_blocks; x_block++)
+            {
+                const int fill_cols  = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+                const int blank_cols = BlockBy - fill_cols;
+
+                const int x_base = x0 + (x_block * BlockBy);
+
+                for(int row = 0; row < fill_rows; row++)
+                {
+                    for(int col = 0; col < fill_cols; col++)
+                    {
+                        // In-range copy.  If it's transposed, we reverse the sense of rows and columns here.
+                        if(Transposed)
+                        {
+                            out[out_index] = in[(x_base + col) * stride + y_base + row];
+                            out_index++;
+                        }
+                        else
+                        {
+                            out[out_index] = in[(y_base + row) * stride + x_base + col];
+                            out_index++;
+                        }
+                    }
+                    // "col" tail - row is in range but column is out of range.
+                    for(int col = 0; col < blank_cols; col++)
+                    {
+                        out[out_index] = 0;
+                        out_index++;
+                    }
+                }
+                // "row" tail - row is out of range so zero-fill the remaining `pads` elements of this block.
+                const d_type zeroval = 0;
+                const int    pads    = blank_rows * (fill_cols + blank_cols);
+
+                for(int i = 0; i < pads; i++)
+                {
+                    out[out_index + i] = zeroval; // offset by i: every pad slot is written, not just the first
+                }
+
+                out_index += pads;
+            }
+        }
+    }
+};
+/** Reference reorder: returns a tensor of @p output_shape holding @p src re-laid out for @p output_wf (OHWIo4 / OHWIo8 only). */
+template <typename T>
+SimpleTensor<T> reorder_layer(const SimpleTensor<T> &src, const TensorShape &output_shape, WeightFormat output_wf)
+{
+    SimpleTensor<T> dst{ output_shape, src.data_type() };
+    const int       cols = src.shape()[0]; // extent of dimension 0 of src
+    const int       rows = src.shape()[1]; // extent of dimension 1 of src; also passed as the transform stride
+
+    switch(output_wf)
+    {
+        case WeightFormat::OHWIo4: // transposed transform, tIntBy = 4, BlockBy = 1, over y in [0, rows) and x in [0, cols)
+        {
+            Transform_ref<4, 1, true, sizeof(float), sizeof(float), float, arm_gemm::VLType::None>::Transform<SimpleTensor<T> &, SimpleTensor<T>>(dst, src, rows, 0, rows, 0, cols);
+            break;
+        }
+        case WeightFormat::OHWIo8: // same as above with tIntBy = 8
+        {
+            Transform_ref<8, 1, true, sizeof(float), sizeof(float), float, arm_gemm::VLType::None>::Transform<SimpleTensor<T> &, SimpleTensor<T>>(dst, src, rows, 0, rows, 0, cols);
+            break;
+        }
+        default: // any other format performs no transform: dst is returned exactly as constructed above
+            break;
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> reorder_layer(const SimpleTensor<float> &src, const TensorShape &output_shape, WeightFormat output_wf);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Dilate.h b/tests/validation/reference/Reorder.h
index 640bc9dd73..94ee5078f8 100644
--- a/tests/validation/reference/Dilate.h
+++ b/tests/validation/reference/Reorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_DILATE_H
-#define ARM_COMPUTE_TEST_DILATE_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_REORDER
+#define ACL_TESTS_VALIDATION_REFERENCE_REORDER
 
 #include "tests/SimpleTensor.h"
+#include "tests/Types.h"
 
 namespace arm_compute
 {
@@ -35,9 +36,9 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> dilate(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value = 0);
+SimpleTensor<T> reorder_layer(const SimpleTensor<T> &src, const TensorShape &output_shape, WeightFormat output_wf);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DILATE_H */
+#endif /* ACL_TESTS_VALIDATION_REFERENCE_REORDER */
diff --git a/tests/validation/reference/ReshapeLayer.cpp b/tests/validation/reference/ReshapeLayer.cpp
index daea001be6..30a58dd65b 100644
--- a/tests/validation/reference/ReshapeLayer.cpp
+++ b/tests/validation/reference/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,14 +44,15 @@ SimpleTensor<T> reshape_layer(const SimpleTensor<T> &src, const TensorShape &out
     return dst;
 }
 
-template SimpleTensor<uint8_t> reshape_layer(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
-template SimpleTensor<int8_t> reshape_layer(const SimpleTensor<int8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<uint8_t>  reshape_layer(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int8_t>   reshape_layer(const SimpleTensor<int8_t> &src, const TensorShape &output_shape);
 template SimpleTensor<uint16_t> reshape_layer(const SimpleTensor<uint16_t> &src, const TensorShape &output_shape);
-template SimpleTensor<int16_t> reshape_layer(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int16_t>  reshape_layer(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
 template SimpleTensor<uint32_t> reshape_layer(const SimpleTensor<uint32_t> &src, const TensorShape &output_shape);
-template SimpleTensor<int32_t> reshape_layer(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
-template SimpleTensor<half> reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
-template SimpleTensor<float> reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
+template SimpleTensor<int32_t>  reshape_layer(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
+template SimpleTensor<half>     reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
+template SimpleTensor<float>    reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
+template SimpleTensor<bfloat16> reshape_layer(const SimpleTensor<bfloat16> &src, const TensorShape &output_shape);
 /** [ReshapeLayer] **/
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp
index c6c4614278..7924f900d1 100644
--- a/tests/validation/reference/Reverse.cpp
+++ b/tests/validation/reference/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,9 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t> &axis)
+SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis)
 {
+    ARM_COMPUTE_ERROR_ON(src.shape().num_dimensions() > 4);
     ARM_COMPUTE_ERROR_ON(axis.shape().num_dimensions() > 1);
     ARM_COMPUTE_ERROR_ON(axis.shape().x() > 4);
 
@@ -48,10 +49,32 @@ SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t>
     const unsigned int depth   = src.shape()[2];
     const unsigned int batches = src.shape()[3];
 
+    const int rank = src.shape().num_dimensions();
+
     std::array<bool, 4> to_reverse = { { false, false, false, false } };
     for(int i = 0; i < axis.num_elements(); ++i)
     {
-        to_reverse[axis[i]] = true;
+        int axis_i = axis[i];
+
+        // The values of axis tensor must be between [-rank, rank-1].
+        if((axis_i < -rank) || (axis_i >= rank))
+        {
+            ARM_COMPUTE_ERROR("the values of the axis tensor must be within [-rank, rank-1].");
+        }
+
+        // In case of negative axis value i.e targeted axis(i) = rank + axis(i)
+        if(axis_i < 0)
+        {
+            axis_i = rank + axis_i;
+        }
+
+        // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
+        if(use_inverted_axis)
+        {
+            axis_i = (rank - 1) - axis_i;
+        }
+
+        to_reverse[axis_i] = true;
     }
 
     const uint32_t num_elements = src.num_elements();
@@ -73,9 +96,9 @@ SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t>
     return dst;
 }
 
-template SimpleTensor<uint8_t> reverse(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint32_t> &axis);
-template SimpleTensor<half> reverse(const SimpleTensor<half> &src, const SimpleTensor<uint32_t> &axis);
-template SimpleTensor<float> reverse(const SimpleTensor<float> &src, const SimpleTensor<uint32_t> &axis);
+template SimpleTensor<uint8_t> reverse(const SimpleTensor<uint8_t> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis);
+template SimpleTensor<half> reverse(const SimpleTensor<half> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis);
+template SimpleTensor<float> reverse(const SimpleTensor<float> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Reverse.h b/tests/validation/reference/Reverse.h
index 4a28da7270..30926b05a5 100644
--- a/tests/validation/reference/Reverse.h
+++ b/tests/validation/reference/Reverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_REVERSE_H
-#define ARM_COMPUTE_TEST_REVERSE_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H
+#define ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H
 
 #include "tests/SimpleTensor.h"
 
@@ -35,9 +35,9 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t> &axis);
+SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis = false);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REVERSE_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 71e98fd776..2f429cb29b 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 #include "Scale.h"
 
 #include "Utils.h"
-#include "arm_compute/core/utils/misc/Utility.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -183,14 +182,15 @@ SimpleTensor<T> scale_core(const SimpleTensor<T> &in, float scale_x, float scale
 
 template <typename T>
 SimpleTensor<T> scale(const SimpleTensor<T> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value,
-                      SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners)
+                      SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info)
 {
+    ARM_COMPUTE_UNUSED(output_quantization_info);
     return scale_core<T>(src, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, ceil_policy_scale, align_corners);
 }
 
 template <>
 SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value,
-                            SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners)
+                            SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info)
 {
     SimpleTensor<uint8_t> dst;
     if(src.quantization_info().uniform().scale != 0.f)
@@ -198,7 +198,7 @@ SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, flo
         SimpleTensor<float> src_tmp                 = convert_from_asymmetric(src);
         float               constant_border_value_f = dequantize_qasymm8(constant_border_value, src.quantization_info());
         SimpleTensor<float> dst_tmp                 = scale_core<float>(src_tmp, scale_x, scale_y, policy, border_mode, constant_border_value_f, sampling_policy, ceil_policy_scale, align_corners);
-        dst                                         = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
+        dst                                         = convert_to_asymmetric<uint8_t>(dst_tmp, output_quantization_info);
     }
     else
     {
@@ -209,7 +209,7 @@ SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, flo
 
 template <>
 SimpleTensor<int8_t> scale(const SimpleTensor<int8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, int8_t constant_border_value,
-                           SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners)
+                           SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info)
 {
     SimpleTensor<int8_t> dst;
     if(src.quantization_info().uniform().scale != 0.f)
@@ -217,7 +217,7 @@ SimpleTensor<int8_t> scale(const SimpleTensor<int8_t> &src, float scale_x, float
         SimpleTensor<float> src_tmp                 = convert_from_asymmetric(src);
         float               constant_border_value_f = dequantize_qasymm8_signed(constant_border_value, src.quantization_info());
         SimpleTensor<float> dst_tmp                 = scale_core<float>(src_tmp, scale_x, scale_y, policy, border_mode, constant_border_value_f, sampling_policy, ceil_policy_scale, align_corners);
-        dst                                         = convert_to_asymmetric<int8_t>(dst_tmp, src.quantization_info());
+        dst                                         = convert_to_asymmetric<int8_t>(dst_tmp, output_quantization_info);
     }
     else
     {
@@ -227,11 +227,11 @@ SimpleTensor<int8_t> scale(const SimpleTensor<int8_t> &src, float scale_x, float
 }
 
 template SimpleTensor<int16_t> scale(const SimpleTensor<int16_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, int16_t constant_border_value,
-                                     SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners);
+                                     SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info);
 template SimpleTensor<half> scale(const SimpleTensor<half> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, half constant_border_value,
-                                  SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners);
+                                  SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info);
 template SimpleTensor<float> scale(const SimpleTensor<float> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, float constant_border_value,
-                                   SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners);
+                                   SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Scale.h b/tests/validation/reference/Scale.h
index c66af8d94e..c32c07d1c0 100644
--- a/tests/validation/reference/Scale.h
+++ b/tests/validation/reference/Scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,7 +37,7 @@ namespace reference
 {
 template <typename T>
 SimpleTensor<T> scale(const SimpleTensor<T> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value = 0,
-                      SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool ceil_policy_scale = false, bool align_corners = false);
+                      SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool ceil_policy_scale = false, bool align_corners = false, QuantizationInfo output_quantization_info = QuantizationInfo());
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ScatterLayer.cpp b/tests/validation/reference/ScatterLayer.cpp
new file mode 100644
index 0000000000..55c48a9002
--- /dev/null
+++ b/tests/validation/reference/ScatterLayer.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ScatterLayer.h"
+#include "tests/validation/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+
+/** Merge an incoming update with the value already stored at the destination.
+ *
+ * @param[in] current Value currently held by the destination element.
+ * @param[in] update  Value taken from the updates tensor.
+ * @param[in] func    Scatter function selecting the merge: Update, Add, Sub, Max or Min.
+ */
+template <typename T>
+T reduce_op(const T &current,const T &update,const ScatterFunction func)
+{
+    switch(func)
+    {
+        case ScatterFunction::Update:
+            return update;
+        case ScatterFunction::Add:
+            return current + update;
+        case ScatterFunction::Sub:
+            return current - update;
+        case ScatterFunction::Max:
+            return std::max(current, update);
+        case ScatterFunction::Min:
+            return std::min(current, update);
+        default:
+            ARM_COMPUTE_ERROR("Unsupported Scatter function");
+    }
+}
+
+template float reduce_op(const float &current,const float &update,const ScatterFunction func);
+template half reduce_op(const half &current,const half &update,const ScatterFunction func);
+}
+
+// NOTE: This function expects collapsed tensors as input.
+// Batch dims for update/indices tensors should be collapsed into a single dim.
+// Data dims should be collapsed into a single dim for both update and src tensors prior to calling this function.
+// Index tuples holding a negative or too large value are skipped and leave the output untouched.
+template <typename T>
+SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info)
+{
+    // 1. If zero initialization variable is false, copy src data to dst.
+    SimpleTensor<T> dst{ out_shape, src.data_type(), 1 };
+    if(!info.zero_initialization)
+    {
+        std::copy_n(src.data(), src.num_elements(), dst.data());
+    }
+
+    // Number of elements between each value of the dim being iterated through
+    const unsigned int data_stride = updates.shape().total_size_lower(updates.shape().num_dimensions() - 1);
+    const unsigned int no_output_dims = out_shape.num_dimensions();
+
+    // Calculate output stride at given index for all output dims.
+    std::vector<unsigned int> out_stride_at_idx(no_output_dims);
+    for (unsigned int i = 0 ; i < no_output_dims; i++)
+    {
+        out_stride_at_idx[i] = out_shape.total_size_lower(i);
+    }
+
+    const unsigned int indices_x_dim = static_cast<unsigned int>(indices.shape()[0]);
+    const unsigned int indices_y_dim = static_cast<unsigned int>(indices.shape()[1]);
+    // Each index addresses one output dim, starting from the outermost: more indices than dims would wrap out_dim below.
+    ARM_COMPUTE_ERROR_ON_MSG(indices_x_dim > no_output_dims, "Index tuple is longer than the number of output dimensions");
+
+    // 2. Iterate over indices tensor y-dim and replace sections of dst tensor with relevant areas of update tensor.
+    for(unsigned int i = 0; i < indices_y_dim; i++)
+    {
+        // NOTE : Currently, indices.shape() == [X, Y, 1, 1], where  X is the indices dim and Y is the batch dim
+        // Starting index for both the update and indices tensors.
+        const unsigned int update_dim_start = i * data_stride;
+        const unsigned int indices_dim_start = i * indices_x_dim;
+        bool out_of_bounds = false;
+        unsigned int out_offset_acc = 0;
+
+        // Iterate over each indices value for the relevant batch and accumulate the offset.
+        for(unsigned int j = 0; j < indices_x_dim; j++)
+        {
+            // Get first index value with i * indices_x_dim (iterating through y-dim/batch idx), then iterate through x dim by adding j
+            const int index_value = indices[indices_dim_start + j];
+            const unsigned int out_dim = no_output_dims - (j+1);   // Calculate corresponding output dim to current index value.
+            if(index_value < 0 || index_value >= static_cast<int>(out_shape[out_dim]))
+            {
+                out_of_bounds = true;
+                break;
+            }
+            out_offset_acc += static_cast<unsigned int>(index_value) * out_stride_at_idx[out_dim]; // offset accumulation
+        }
+
+        // If not out of bounds, merge update tensor elements into the output
+        if(!out_of_bounds)
+        {
+            for (unsigned int j = 0 ; j < data_stride; j++)
+            {
+                dst[out_offset_acc + j] = reduce_op(dst[out_offset_acc + j], updates[update_dim_start + j], info.func);
+            }
+        }
+    }
+    return dst;
+}
+
+template <typename T>
+SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info)
+{
+    return scatter_layer_internal<T>(src, updates, indices, out_shape, info); // Thin forwarder: all the work happens in scatter_layer_internal.
+}
+// Explicit instantiations for float, half and the signed/unsigned 8-, 16- and 32-bit integer types.
+template SimpleTensor<float> scatter_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<half> scatter_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int32_t> scatter_layer(const SimpleTensor<int32_t> &src, const SimpleTensor<int32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint32_t> scatter_layer(const SimpleTensor<uint32_t> &src, const SimpleTensor<uint32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int16_t> scatter_layer(const SimpleTensor<int16_t> &src, const SimpleTensor<int16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint16_t> scatter_layer(const SimpleTensor<uint16_t> &src, const SimpleTensor<uint16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int8_t> scatter_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint8_t> scatter_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HOGDescriptor.h b/tests/validation/reference/ScatterLayer.h
index dffeb655b2..97d5e70b0d 100644
--- a/tests/validation/reference/HOGDescriptor.h
+++ b/tests/validation/reference/ScatterLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,9 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_HOG_DESCRIPTOR_H
-#define ARM_COMPUTE_TEST_HOG_DESCRIPTOR_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_SCATTERLAYER_H
+#define ACL_TESTS_VALIDATION_REFERENCE_SCATTERLAYER_H
 
+#include "Utils.h"
+#include "arm_compute/function_info/ScatterInfo.h"
 #include "tests/SimpleTensor.h"
 
 namespace arm_compute
@@ -34,16 +36,13 @@ namespace validation
 {
 namespace reference
 {
-template <typename T, typename U, typename V>
-void hog_orientation_binning(const SimpleTensor<T> &mag, const SimpleTensor<U> &phase, SimpleTensor<V> &hog_space, const HOGInfo &hog_info);
-
 template <typename T>
-void hog_block_normalization(SimpleTensor<T> &desc, const SimpleTensor<T> &hog_space, const HOGInfo &hog_info);
+SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); // Parameter names mirror the definition in ScatterLayer.cpp.
 
-template <typename T, typename U>
-SimpleTensor<T> hog_descriptor(const SimpleTensor<U> &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info);
+template <typename T>
+SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); // Parameter names mirror the definition in ScatterLayer.cpp.
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_HOG_DESCRIPTOR_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_SCATTERLAYER_H
diff --git a/tests/validation/reference/Scharr.cpp b/tests/validation/reference/Scharr.cpp
deleted file mode 100644
index e9fbb73d49..0000000000
--- a/tests/validation/reference/Scharr.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Scharr.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-
-#include <array>
-#include <map>
-#include <utility>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-const std::array<int8_t, 9> scharr_3_x{ { -3, 0, 3, -10, 0, 10, -3, 0, 3 } };
-const std::array<int8_t, 9> scharr_3_y{ { -3, -10, -3, 0, 0, 0, 3, 10, 3 } };
-
-const std::map<int, std::pair<const int8_t *, const int8_t *>> masks
-{
-    { 3, { scharr_3_x.data(), scharr_3_y.data() } }
-};
-
-template <typename T>
-struct data_type;
-
-template <>
-struct data_type<int16_t>
-{
-    const static DataType value = DataType::S16;
-};
-} // namespace
-
-template <typename T, typename U>
-std::pair<SimpleTensor<T>, SimpleTensor<T>> scharr(const SimpleTensor<U> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value, GradientDimension gradient_dimension)
-{
-    const auto shape_size = static_cast<unsigned int>(filter_size);
-
-    SimpleTensor<T> dst_x(src.shape(), data_type<T>::value, src.num_channels());
-    SimpleTensor<T> dst_y(src.shape(), data_type<T>::value, src.num_channels());
-
-    ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(filter_size / 2));
-
-    const uint32_t num_elements = src.num_elements();
-    for(uint32_t i = 0; i < num_elements; ++i)
-    {
-        Coordinates coord = index2coord(src.shape(), i);
-
-        if(!is_in_valid_region(valid_region, coord))
-        {
-            continue;
-        }
-
-        switch(gradient_dimension)
-        {
-            case GradientDimension::GRAD_X:
-                apply_2d_spatial_filter(coord, src, dst_x, TensorShape{ shape_size, shape_size }, masks.at(filter_size).first, 1.f, border_mode, constant_border_value);
-                break;
-            case GradientDimension::GRAD_Y:
-                apply_2d_spatial_filter(coord, src, dst_y, TensorShape{ shape_size, shape_size }, masks.at(filter_size).second, 1.f, border_mode, constant_border_value);
-                break;
-            case GradientDimension::GRAD_XY:
-                apply_2d_spatial_filter(coord, src, dst_x, TensorShape{ shape_size, shape_size }, masks.at(filter_size).first, 1.f, border_mode, constant_border_value);
-                apply_2d_spatial_filter(coord, src, dst_y, TensorShape{ shape_size, shape_size }, masks.at(filter_size).second, 1.f, border_mode, constant_border_value);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Gradient dimension not supported");
-        }
-    }
-
-    return std::make_pair(dst_x, dst_y);
-}
-
-template std::pair<SimpleTensor<int16_t>, SimpleTensor<int16_t>> scharr(const SimpleTensor<uint8_t> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value,
-                                                                        GradientDimension gradient_dimension);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Scharr.h b/tests/validation/reference/Scharr.h
deleted file mode 100644
index 42b3202d64..0000000000
--- a/tests/validation/reference/Scharr.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_SCHARR_H
-#define ARM_COMPUTE_TEST_SCHARR_H
-
-#include "tests/SimpleTensor.h"
-#include "tests/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-std::pair<SimpleTensor<T>, SimpleTensor<T>> scharr(const SimpleTensor<U> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value, GradientDimension gradient_dimension);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SCHARR_H */
diff --git a/tests/validation/reference/Sobel.cpp b/tests/validation/reference/Sobel.cpp
deleted file mode 100644
index d9c2532add..0000000000
--- a/tests/validation/reference/Sobel.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Sobel.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-
-#include <array>
-#include <map>
-#include <utility>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-const std::array<int8_t, 9> sobel_3_x{ { -1, 0, 1, -2, 0, 2, -1, 0, 1 } };
-const std::array<int8_t, 9> sobel_3_y{ { -1, -2, -1, 0, 0, 0, 1, 2, 1 } };
-
-const std::array<int8_t, 25> sobel_5_x{ {
-        -1, -2, 0, 2, 1,
-        -4, -8, 0, 8, 4,
-        -6, -12, 0, 12, 6,
-        -4, -8, 0, 8, 4,
-        -1, -2, 0, 2, 1
-    } };
-
-const std::array<int8_t, 25> sobel_5_y{ {
-        -1, -4, -6, -4, -1,
-        -2, -8, -12, -8, -2,
-        0, 0, 0, 0, 0,
-        2, 8, 12, 8, 2,
-        1, 4, 6, 4, 1
-    } };
-
-const std::array<int8_t, 49> sobel_7_x{ {
-        -1, -4, -5, 0, 5, 4, 1,
-        -6, -24, -30, 0, 30, 24, 6,
-        -15, -60, -75, 0, 75, 60, 15,
-        -20, -80, -100, 0, 100, 80, 20,
-        -15, -60, -75, 0, 75, 60, 15,
-        -6, -24, -30, 0, 30, 24, 6,
-        -1, -4, -5, 0, 5, 4, 1
-    } };
-
-const std::array<int8_t, 49> sobel_7_y{ {
-        -1, -6, -15, -20, -15, -6, -1,
-        -4, -24, -60, -80, -60, -24, -4,
-        -5, -30, -75, -100, -75, -30, -5,
-        0, 0, 0, 0, 0, 0, 0,
-        5, 30, 75, 100, 75, 30, 5,
-        4, 24, 60, 80, 60, 24, 4,
-        1, 6, 15, 20, 15, 6, 1
-    } };
-
-const std::map<int, std::pair<const int8_t *, const int8_t *>> masks
-{
-    { 3, { sobel_3_x.data(), sobel_3_y.data() } },
-    { 5, { sobel_5_x.data(), sobel_5_y.data() } },
-    { 7, { sobel_7_x.data(), sobel_7_y.data() } },
-};
-
-template <typename T>
-struct data_type;
-
-template <>
-struct data_type<int16_t>
-{
-    const static DataType value = DataType::S16;
-};
-
-template <>
-struct data_type<int>
-{
-    const static DataType value = DataType::S32;
-};
-} // namespace
-
-template <typename T, typename U>
-std::pair<SimpleTensor<T>, SimpleTensor<T>> sobel(const SimpleTensor<U> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value, GradientDimension gradient_dimension)
-{
-    SimpleTensor<T> dst_x(src.shape(), data_type<T>::value, src.num_channels());
-    SimpleTensor<T> dst_y(src.shape(), data_type<T>::value, src.num_channels());
-
-    ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(filter_size / 2));
-
-    const uint32_t num_elements = src.num_elements();
-    for(uint32_t i = 0; i < num_elements; ++i)
-    {
-        Coordinates coord = index2coord(src.shape(), i);
-
-        if(!is_in_valid_region(valid_region, coord))
-        {
-            continue;
-        }
-        switch(gradient_dimension)
-        {
-            case GradientDimension::GRAD_X:
-                apply_2d_spatial_filter(coord, src, dst_x, TensorShape{ static_cast<unsigned int>(filter_size), static_cast<unsigned int>(filter_size) }, masks.at(filter_size).first, 1.f, border_mode,
-                                        constant_border_value);
-                break;
-            case GradientDimension::GRAD_Y:
-                apply_2d_spatial_filter(coord, src, dst_y, TensorShape{ static_cast<unsigned int>(filter_size), static_cast<unsigned int>(filter_size) }, masks.at(filter_size).second, 1.f, border_mode,
-                                        constant_border_value);
-                break;
-            case GradientDimension::GRAD_XY:
-                apply_2d_spatial_filter(coord, src, dst_x, TensorShape{ static_cast<unsigned int>(filter_size), static_cast<unsigned int>(filter_size) }, masks.at(filter_size).first, 1.f, border_mode,
-                                        constant_border_value);
-                apply_2d_spatial_filter(coord, src, dst_y, TensorShape{ static_cast<unsigned int>(filter_size), static_cast<unsigned int>(filter_size) }, masks.at(filter_size).second, 1.f, border_mode,
-                                        constant_border_value);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Gradient dimension not supported");
-        }
-    }
-
-    return std::make_pair(dst_x, dst_y);
-}
-
-template std::pair<SimpleTensor<int16_t>, SimpleTensor<int16_t>> sobel(const SimpleTensor<uint8_t> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value,
-                                                                       GradientDimension gradient_dimension);
-template std::pair<SimpleTensor<int>, SimpleTensor<int>> sobel(const SimpleTensor<uint8_t> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value,
-                                                               GradientDimension gradient_dimension);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Sobel.h b/tests/validation/reference/Sobel.h
deleted file mode 100644
index 86d6d0bc11..0000000000
--- a/tests/validation/reference/Sobel.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_SOBEL_H
-#define ARM_COMPUTE_TEST_SOBEL_H
-
-#include "arm_compute/core/Types.h"
-#include "tests/SimpleTensor.h"
-#include "tests/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename U>
-std::pair<SimpleTensor<T>, SimpleTensor<T>> sobel(const SimpleTensor<U> &src, int filter_size, BorderMode border_mode, uint8_t constant_border_value, GradientDimension gradient_dimension);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SOBEL_H */
diff --git a/tests/validation/reference/Threshold.cpp b/tests/validation/reference/Threshold.cpp
deleted file mode 100644
index 6bc6cf0b4a..0000000000
--- a/tests/validation/reference/Threshold.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal src the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included src all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. src NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER src AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * dst OF OR src CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS src THE
- * SOFTWARE.
- */
-#include "Threshold.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> threshold(const SimpleTensor<T> &src, T threshold, T false_value, T true_value, ThresholdType type, T upper)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
-    switch(type)
-    {
-        case ThresholdType::BINARY:
-            for(int i = 0; i < src.num_elements(); ++i)
-            {
-                dst[i] = ((src[i] > threshold) ? true_value : false_value);
-            }
-            break;
-        case ThresholdType::RANGE:
-            for(int i = 0; i < src.num_elements(); ++i)
-            {
-                if(src[i] > upper)
-                {
-                    dst[i] = false_value;
-                }
-                else if(src[i] < threshold)
-                {
-                    dst[i] = false_value;
-                }
-                else
-                {
-                    dst[i] = true_value;
-                }
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Thresholding type not recognised");
-            break;
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> threshold(const SimpleTensor<uint8_t> &src, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Threshold.h b/tests/validation/reference/Threshold.h
deleted file mode 100644
index bee9531351..0000000000
--- a/tests/validation/reference/Threshold.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_THRESHOLD_H
-#define ARM_COMPUTE_TEST_THRESHOLD_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> threshold(const SimpleTensor<T> &src, T threshold, T false_value, T true_value, ThresholdType type, T upper);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_THRESHOLD_H */
diff --git a/tests/validation/reference/Utils.h b/tests/validation/reference/Utils.h
index 8e15faab8f..c83c6ea4b3 100644
--- a/tests/validation/reference/Utils.h
+++ b/tests/validation/reference/Utils.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/core/Types.h"
 #include "tests/Globals.h"
-#include "tests/ILutAccessor.h"
 #include "tests/Types.h"
 
 #include <array>
@@ -123,26 +122,6 @@ void apply_2d_spatial_filter(Coordinates coord, const SimpleTensor<T> &src, Simp
 
 RawTensor transpose(const RawTensor &src, int chunk_width = 1);
 
-/** Fill matrix random.
- *
- * @param[in,out] matrix Matrix
- */
-template <std::size_t SIZE>
-inline void fill_warp_matrix(std::array<float, SIZE> &matrix)
-{
-    std::mt19937                          gen(library.get()->seed());
-    std::uniform_real_distribution<float> dist(-1, 1);
-    for(auto &x : matrix)
-    {
-        x = dist(gen);
-    }
-    if(SIZE == 9)
-    {
-        // This is only used in Warp Perspective, we set M[3][3] = 1 so that Z0 is not 0 and we avoid division by 0.
-        matrix[8] = 1.f;
-    }
-}
-
 bool valid_bilinear_policy(float xn, float yn, int width, int height, BorderMode border_mode);
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h
index 25873acc93..e5ecc66545 100644
--- a/tests/validation/reference/UtilsQuantizedAsymm.h
+++ b/tests/validation/reference/UtilsQuantizedAsymm.h
@@ -32,6 +32,22 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+#if __clang__
+// Clang-only widening of int32_t to int64_t that additionally ORs 0xFFFFFFFF00000000 into the result when val is negative. This has been tested on clang 7.0.2 (__clang_major__ == 7 && __clang_minor__ == 0 && __clang_patchlevel__ == 2)
+inline int64_t to_int64(int32_t val)
+{
+    return static_cast<int64_t>(val) | ((val < 0) ? (((1ll << 32) - 1) << 32) : 0); // NOTE(review): presumably forces the upper 32 bits to work around a clang sign-extension issue - confirm
+}
+#else  // __clang__
+inline int64_t to_int64(int32_t val)
+{
+    return static_cast<int64_t>(val); // Plain widening conversion; the cast alone preserves the value
+}
+#endif // __clang__
+} // namespace
+
 /** Rounded to nearest division by a power-of-two. */
 inline int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
 {
@@ -40,15 +56,15 @@ inline int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
     return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
 }
 
-/** Multiplication of two integers. The same as ARMv7 Neon VQRDMULH instruction. */
+/** Multiplication of two integers, returning the rounded high 32 bits of 2 * a * b, saturated to the int32_t maximum. The same as ARMv7 Arm® Neon™ VQRDMULH instruction. */
 inline int32_t asymm_int_mult(int32_t a, int32_t b)
 {
-    bool    overflow = a == b && a == std::numeric_limits<int32_t>::min();
-    int64_t a_64(a);
-    int64_t b_64(b);
-    int64_t ab_64        = a_64 * b_64;
-    int32_t nudge        = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
-    int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+    const bool    overflow     = a == b && a == std::numeric_limits<int32_t>::min(); // True only when both operands are the int32_t minimum; that case is saturated on return
+    const int64_t a_64         = to_int64(a);
+    const int64_t b_64         = to_int64(b);
+    const int64_t ab_64        = a_64 * b_64; // Full product computed in 64 bits
+    const int32_t nudge        = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); // Offset that makes the truncating division below round to nearest
+    const int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31)); // Dividing by 2^31 yields the high 32 bits of the doubled product
     return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
 }
 
diff --git a/tests/validation/reference/WarpAffine.cpp b/tests/validation/reference/WarpAffine.cpp
deleted file mode 100644
index 3580b75d43..0000000000
--- a/tests/validation/reference/WarpAffine.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "WarpAffine.h"
-
-#include "Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-bool valid_bilinear_policy(float xn, float yn, int width, int height, BorderMode border_mode)
-{
-    if(border_mode != BorderMode::UNDEFINED)
-    {
-        return true;
-    }
-    if((0 <= yn + 1) && (yn + 1 < height) && (0 <= xn + 1) && (xn + 1 < width))
-    {
-        return true;
-    }
-    return false;
-}
-
-template <typename T>
-SimpleTensor<T> warp_affine(const SimpleTensor<T> &src, SimpleTensor<T> &valid_mask, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
-    // x0 = M00 * x + M01 * y + M02
-    // y0 = M10 * x + M11 * y + M12
-    const float M00 = matrix[0];
-    const float M10 = matrix[1];
-    const float M01 = matrix[0 + 1 * 2];
-    const float M11 = matrix[1 + 1 * 2];
-    const float M02 = matrix[0 + 2 * 2];
-    const float M12 = matrix[1 + 2 * 2];
-
-    const int width  = src.shape().x();
-    const int height = src.shape().y();
-
-    const uint32_t num_elements = src.num_elements();
-    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-    {
-        valid_mask[element_idx] = 1;
-        Coordinates id          = index2coord(src.shape(), element_idx);
-        int         idx         = id.x();
-        int         idy         = id.y();
-
-        float x0 = M00 * idx + M01 * idy + M02;
-        float y0 = M10 * idx + M11 * idy + M12;
-
-        id.set(0, static_cast<int>(std::floor(x0)));
-        id.set(1, static_cast<int>(std::floor(y0)));
-        if((0 <= y0) && (y0 < height) && (0 <= x0) && (x0 < width))
-        {
-            switch(policy)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    dst[element_idx] = tensor_elem_at(src, id, border_mode, constant_border_value);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    (valid_bilinear_policy(x0, y0, width, height, border_mode)) ? dst[element_idx] = bilinear_policy(src, id, x0, y0, border_mode, constant_border_value) :
-                                                                                                     valid_mask[element_idx] = 0;
-                    break;
-                case InterpolationPolicy::AREA:
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-        else
-        {
-            if(border_mode == BorderMode::UNDEFINED)
-            {
-                valid_mask[element_idx] = 0;
-            }
-            else
-            {
-                switch(policy)
-                {
-                    case InterpolationPolicy::NEAREST_NEIGHBOR:
-                        if(border_mode == BorderMode::CONSTANT)
-                        {
-                            dst[element_idx] = constant_border_value;
-                        }
-                        else if(border_mode == BorderMode::REPLICATE)
-                        {
-                            id.set(0, std::max(0, std::min(static_cast<int>(x0), width - 1)));
-                            id.set(1, std::max(0, std::min(static_cast<int>(y0), height - 1)));
-                            dst[element_idx] = src[coord2index(src.shape(), id)];
-                        }
-                        break;
-                    case InterpolationPolicy::BILINEAR:
-                        dst[element_idx] = bilinear_policy(src, id, x0, y0, border_mode, constant_border_value);
-                        break;
-                    case InterpolationPolicy::AREA:
-                    default:
-                        ARM_COMPUTE_ERROR("Interpolation not supported");
-                }
-            }
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> warp_affine(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &valid_mask, const float *matrix, InterpolationPolicy policy, BorderMode border_mode,
-                                           uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/tests/validation/reference/WarpAffine.h b/tests/validation/reference/WarpAffine.h
deleted file mode 100644
index 90f765c283..0000000000
--- a/tests/validation/reference/WarpAffine.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_WARP_AFFINE_H
-#define ARM_COMPUTE_TEST_WARP_AFFINE_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> warp_affine(const SimpleTensor<T> &src, SimpleTensor<T> &valid_mask, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WARP_AFFINE_H */
diff --git a/tests/validation/reference/WarpPerspective.cpp b/tests/validation/reference/WarpPerspective.cpp
deleted file mode 100644
index e35d75e6e2..0000000000
--- a/tests/validation/reference/WarpPerspective.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-
-#include "Utils.h"
-#include "WarpPerspective.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> warp_perspective(const SimpleTensor<T> &src, SimpleTensor<T> &valid_mask, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
-    // x0 = M00 * x + M01 * y + M02
-    // y0 = M10 * x + M11 * y + M12
-    // z0 = M20 * x + M21 * y + M22
-    // xn = x0 / z0
-    // yn = y0 / z0
-    const float M00 = matrix[0];
-    const float M10 = matrix[1];
-    const float M20 = matrix[2];
-    const float M01 = matrix[0 + 1 * 3];
-    const float M11 = matrix[1 + 1 * 3];
-    const float M21 = matrix[2 + 1 * 3];
-    const float M02 = matrix[0 + 2 * 3];
-    const float M12 = matrix[1 + 2 * 3];
-    const float M22 = matrix[2 + 2 * 3];
-
-    const int width  = src.shape().x();
-    const int height = src.shape().y();
-
-    const uint32_t num_elements = src.num_elements();
-    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
-    {
-        valid_mask[element_idx] = 1;
-        Coordinates id          = index2coord(src.shape(), element_idx);
-        const int   idx         = id.x();
-        const int   idy         = id.y();
-        const float z0          = M20 * idx + M21 * idy + M22;
-
-        const float x0 = (M00 * idx + M01 * idy + M02);
-        const float y0 = (M10 * idx + M11 * idy + M12);
-
-        const float xn = x0 / z0;
-        const float yn = y0 / z0;
-        id.set(0, static_cast<int>(std::floor(xn)));
-        id.set(1, static_cast<int>(std::floor(yn)));
-        if((0 <= yn) && (yn < height) && (0 <= xn) && (xn < width))
-        {
-            switch(policy)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    dst[element_idx] = tensor_elem_at(src, id, border_mode, constant_border_value);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    (valid_bilinear_policy(xn, yn, width, height, border_mode)) ? dst[element_idx] = bilinear_policy(src, id, xn, yn, border_mode, constant_border_value) : valid_mask[element_idx] = 0;
-                    break;
-                case InterpolationPolicy::AREA:
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-                    break;
-            }
-        }
-        else
-        {
-            if(border_mode == BorderMode::UNDEFINED)
-            {
-                valid_mask[element_idx] = 0;
-            }
-            else
-            {
-                switch(policy)
-                {
-                    case InterpolationPolicy::NEAREST_NEIGHBOR:
-                        if(border_mode == BorderMode::CONSTANT)
-                        {
-                            dst[element_idx] = constant_border_value;
-                        }
-                        else if(border_mode == BorderMode::REPLICATE)
-                        {
-                            id.set(0, std::max(0, std::min(static_cast<int>(xn), width - 1)));
-                            id.set(1, std::max(0, std::min(static_cast<int>(yn), height - 1)));
-                            dst[element_idx] = src[coord2index(src.shape(), id)];
-                        }
-                        break;
-                    case InterpolationPolicy::BILINEAR:
-                        dst[element_idx] = bilinear_policy(src, id, xn, yn, border_mode, constant_border_value);
-                        break;
-                    case InterpolationPolicy::AREA:
-                    default:
-                        ARM_COMPUTE_ERROR("Interpolation not supported");
-                        break;
-                }
-            }
-        }
-    }
-    return dst;
-}
-
-template SimpleTensor<uint8_t> warp_perspective(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &valid_mask, const float *matrix, InterpolationPolicy policy, BorderMode border_mode,
-                                                uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/WarpPerspective.h b/tests/validation/reference/WarpPerspective.h
deleted file mode 100644
index 7fcd5ddf7b..0000000000
--- a/tests/validation/reference/WarpPerspective.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_WARP_PERSPECTIVE_H
-#define ARM_COMPUTE_TEST_WARP_PERSPECTIVE_H
-
-#include "tests/SimpleTensor.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> warp_perspective(const SimpleTensor<T> &src, SimpleTensor<T> &valid_mask, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WARP_PERSPECTIVE_H */
diff --git a/tests/validation/reference/YOLOLayer.cpp b/tests/validation/reference/YOLOLayer.cpp
deleted file mode 100644
index fbc81f1af9..0000000000
--- a/tests/validation/reference/YOLOLayer.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "YOLOLayer.h"
-
-#include "ActivationLayer.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> yolo_layer(const SimpleTensor<T> &src, const ActivationLayerInfo &info, int32_t num_classes)
-{
-    // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type() };
-
-    // Compute reference
-    const T a(info.a());
-    const T b(info.b());
-
-    const uint32_t num_elements = src.num_elements();
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(uint32_t i = 0; i < num_elements; ++i)
-    {
-        const size_t z = index2coord(dst.shape(), i).z() % (num_classes + 5);
-
-        if(z != 2 && z != 3)
-        {
-            dst[i] = activate_float<T>(src[i], a, b, info.activation());
-        }
-        else
-        {
-            dst[i] = src[i];
-        }
-    }
-
-    return dst;
-}
-
-template <>
-SimpleTensor<uint8_t> yolo_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const ActivationLayerInfo &info, int32_t num_classes)
-{
-    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float>   dst_tmp = yolo_layer<float>(src_tmp, info, num_classes);
-    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
-    return dst;
-}
-
-template SimpleTensor<float> yolo_layer(const SimpleTensor<float> &src, const ActivationLayerInfo &info, int32_t num_classes);
-template SimpleTensor<half> yolo_layer(const SimpleTensor<half> &src, const ActivationLayerInfo &info, int32_t num_classes);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute