Diffstat (limited to 'tests/validation/dynamic_fusion')
-rw-r--r--  tests/validation/dynamic_fusion/Utils.h                       73
-rw-r--r--  tests/validation/dynamic_fusion/gpu/Integration.cpp          642
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Add.cpp               264
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Cast.cpp               97
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp             184
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp   474
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp      260
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp            335
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Mul.cpp               221
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp            219
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp           147
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Resize.cpp            359
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp           154
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp           219
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Sub.cpp               262
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp              154
16 files changed, 4064 insertions, 0 deletions
diff --git a/tests/validation/dynamic_fusion/Utils.h b/tests/validation/dynamic_fusion/Utils.h
new file mode 100644
index 0000000000..72e9ec5955
--- /dev/null
+++ b/tests/validation/dynamic_fusion/Utils.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef TESTS_VALIDATION_DYNAMIC_FUSION_UTILS
+#define TESTS_VALIDATION_DYNAMIC_FUSION_UTILS
+
+#include "tests/AssetsLibrary.h"
+#include "utils/Utils.h"
+
+#include <chrono>
+#include <limits>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace utils
+{
+/** A set of macros which measure the wall clock time elapsed between a TICK(clock_name)
+ * and the matching TOCK, recording it in microseconds into @p measurement_map under a key
+ * derived from @p clock_name. TOCK_AVG records the average over @p num_iterations instead.
+ */
+#define TICK(clock_name) \
+ auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map)                                \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now();  \
+    measurement_map["\"" #clock_name "\""] = std::chrono::duration_cast<std::chrono::microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations)            \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now();  \
+    measurement_map["\"" #clock_name "\""] = std::chrono::duration_cast<std::chrono::microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
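+/* A minimal usage sketch (hypothetical clock name `run` and a local map named
+ * `measurements`; both are illustrative, not part of this header):
+ *
+ *   std::map<std::string, std::chrono::microseconds> measurements;
+ *   TICK(run)
+ *   // ... workload under measurement ...
+ *   TOCK(run, measurements) // records measurements["\"run\""]
+ */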
+
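+/** Fill @p tensor with values drawn uniformly from [-1, 1] using @p seed, then
+ * fill its border with infinity so that out-of-bounds reads become detectable NaNs.
+ */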
+template <typename T, typename U>
+void fill(U &&tensor, int seed, AssetsLibrary *library)
+{
+ static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+ using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+ DistributionType distribution{ T(-1.0f), T(1.0f) };
+ library->fill(tensor, distribution, seed);
+
+    // Fill the border with infinity so that out-of-bounds reads surface as NaN values in the output (i.e. inf * 0)
+ DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+ library->fill_borders_with_garbage(tensor, distribution_inf, seed);
+}
+} // namespace utils
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* TESTS_VALIDATION_DYNAMIC_FUSION_UTILS */
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
new file mode 100644
index 0000000000..453983c077
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/dynamic_fusion/Utils.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
+#include "tests/validation/reference/DepthwiseConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+
+TEST_CASE(Conv2d, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d1x1(direct_conv)(input, weights, bias)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
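+
+    // Dynamic fusion workflow: fuse operators into the sketch, mark the final
+    // tensors with GpuOutput, configure a ClWorkloadRuntime from the sketch and
+    // run it with the user tensors (inputs first, then outputs).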
+
+ // Fuse conv2d
+ Conv2dAttributes conv2d_attr{};
+ ITensorInfo *input_info = context.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ ITensorInfo *weight_info = context.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+
+ ITensorInfo *conv_out_info = GpuConv2d::create_op(sketch, input_info, weight_info, nullptr, conv2d_attr);
+
+ ITensorInfo *dst_info = context.create_tensor_info();
+ GpuOutput::create_op(sketch, conv_out_info, dst_info);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+    // (Important) Allocate memory for the auxiliary tensors, if there are any
+    // Instead of using ACL-allocated memory, the user can choose to import memory into the tensors
+ for (auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = std::get<0>(data);
+ TensorInfo info = std::get<1>(data);
+ AuxMemoryInfo aux_mem_req = std::get<2>(data);
+ tensor->allocator()->init(info, aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
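+        // (An imported buffer would need to be at least info.total_size() bytes
+        // and satisfy aux_mem_req.alignment.)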
+ }
+
+ // Construct user tensors
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_dst{};
+
+ // Initialize user tensors
+ t_input.allocator()->init(*input_info);
+ t_weight.allocator()->init(*weight_info);
+ t_dst.allocator()->init(*dst_info);
+
+ // Allocate and fill user tensors
+    // Instead of using the ACL allocator, the user can choose to import memory into the tensors
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill<float>(CLAccessor(t_input), 0, library.get());
+ fill<float>(CLAccessor(t_weight), 1, library.get());
+
+ // Run runtime
+ runtime.run({&t_input, &t_weight, &t_dst});
+
+ // Create reference
+ SimpleTensor<float> ref_t_input{t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC};
+ SimpleTensor<float> ref_t_weight{t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC};
+ SimpleTensor<float> ref_t_bias_placeholder{t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC};
+
+ // Fill reference
+ fill<float>(ref_t_input, 0, library.get());
+ fill<float>(ref_t_weight, 1, library.get());
+
+ auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+ auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+ auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+ auto t_dst_shape_nchw = t_dst_shape;
+ permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left,
+ conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+ DimensionRoundingType{});
+ auto ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw,
+ t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
+ const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+ RelativeTolerance<float> tolerance_f32(
+ 0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+
+TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out_0 = in_0 + in_1
+ * out_1 = out_0 + in_2
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto t_input_shape = TensorShape(33, 3, 2);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ ITensorInfo *in_0_info = context.create_tensor_info(t_input_shape, 1, data_type);
+ ITensorInfo *in_1_info = context.create_tensor_info(t_input_shape, 1, data_type);
+ ITensorInfo *in_2_info = context.create_tensor_info(t_input_shape, 1, data_type);
+
+ ITensorInfo *out_0_info = context.create_tensor_info();
+ ITensorInfo *out_1_info = context.create_tensor_info();
+
+ ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, in_0_info, in_1_info);
+ GpuOutput::create_op(sketch, ans_0_info, out_0_info);
+ ITensorInfo *ans_1_info = GpuAdd::create_op(sketch, ans_0_info, in_2_info);
+ GpuOutput::create_op(sketch, ans_1_info, out_1_info);
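+
+    // Note: ans_0_info is consumed twice, once by GpuOutput (out_0) and once as
+    // an input to the second addition, so the fused workload produces two outputs.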
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+    // (Important) Allocate memory for the auxiliary tensors, if there are any
+    // Instead of using ACL-allocated memory, the user can choose to import memory into the tensors
+ for (auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = std::get<0>(data);
+ TensorInfo info = std::get<1>(data);
+ AuxMemoryInfo aux_mem_req = std::get<2>(data);
+ tensor->allocator()->init(info, aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_in_0{};
+ CLTensor t_in_1{};
+ CLTensor t_in_2{};
+
+ CLTensor t_out_0{};
+ CLTensor t_out_1{};
+
+ // Initialize user tensors
+ t_in_0.allocator()->init(*in_0_info);
+ t_in_1.allocator()->init(*in_1_info);
+ t_in_2.allocator()->init(*in_2_info);
+
+ t_out_0.allocator()->init(*out_0_info);
+ t_out_1.allocator()->init(*out_1_info);
+
+ // Allocate and fill user tensors
+    // Instead of using the ACL allocator, the user can choose to import memory into the tensors
+ t_in_0.allocator()->allocate();
+ t_in_1.allocator()->allocate();
+ t_in_2.allocator()->allocate();
+
+ t_out_0.allocator()->allocate();
+ t_out_1.allocator()->allocate();
+
+ fill<float>(CLAccessor(t_in_0), 0, library.get());
+ fill<float>(CLAccessor(t_in_1), 1, library.get());
+ fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+ // Run runtime
+ runtime.run({&t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1});
+
+ // Create reference
+ SimpleTensor<float> ref_t_in_0{t_input_shape, data_type, 1, QuantizationInfo()};
+ SimpleTensor<float> ref_t_in_1{t_input_shape, data_type, 1, QuantizationInfo()};
+ SimpleTensor<float> ref_t_in_2{t_input_shape, data_type, 1, QuantizationInfo()};
+
+ SimpleTensor<float> ref_t_out_0{t_input_shape, data_type, 1, QuantizationInfo()};
+ SimpleTensor<float> ref_t_out_1{t_input_shape, data_type, 1, QuantizationInfo()};
+
+ // Fill reference
+ fill<float>(ref_t_in_0, 0, library.get());
+ fill<float>(ref_t_in_1, 1, library.get());
+ fill<float>(ref_t_in_2, 2, library.get());
+
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1,
+ ConvertPolicy::WRAP);
+
+ RelativeTolerance<float> tolerance_f32(
+ 0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
+ validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
+}
+TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out_0 = in_0 + in_1
+ * out_1 = float(int32_t(out_0 + in_2))
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto t_input_shape = TensorShape(3, 8, 5);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ ITensorInfo *in_0_info = context.create_tensor_info(t_input_shape, 1, data_type);
+ ITensorInfo *in_1_info = context.create_tensor_info(t_input_shape, 1, data_type);
+ ITensorInfo *in_2_info = context.create_tensor_info(t_input_shape, 1, data_type);
+
+ ITensorInfo *out_0_info = context.create_tensor_info();
+ ITensorInfo *out_1_info = context.create_tensor_info();
+
+ CastAttributes cast_0_attr;
+    cast_0_attr.data_type(DataType::S32);
+
+ CastAttributes cast_1_attr;
+ cast_1_attr.data_type(DataType::F32);
+
+ ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, in_0_info, in_1_info);
+ GpuOutput::create_op(sketch, ans_0_info, out_0_info);
+ ITensorInfo *ans_1_info = GpuAdd::create_op(sketch, ans_0_info, in_2_info);
+ ITensorInfo *ans_2_info = GpuCast::create_op(sketch, ans_1_info, cast_0_attr);
+ ITensorInfo *ans_3_info = GpuCast::create_op(sketch, ans_2_info, cast_1_attr);
+ GpuOutput::create_op(sketch, ans_3_info, out_1_info);
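+
+    // The F32 -> S32 -> F32 round trip drops the fractional part, which is why
+    // the cast output is validated with an absolute tolerance of 1 below.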
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+    // (Important) Allocate memory for the auxiliary tensors, if there are any
+    // Instead of using ACL-allocated memory, the user can choose to import memory into the tensors
+ for (auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = std::get<0>(data);
+ TensorInfo info = std::get<1>(data);
+ AuxMemoryInfo aux_mem_req = std::get<2>(data);
+ tensor->allocator()->init(info, aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_in_0{};
+ CLTensor t_in_1{};
+ CLTensor t_in_2{};
+
+ CLTensor t_out_0{};
+ CLTensor t_out_1{};
+
+ // Initialize user tensors
+ t_in_0.allocator()->init(*in_0_info);
+ t_in_1.allocator()->init(*in_1_info);
+ t_in_2.allocator()->init(*in_2_info);
+
+ t_out_0.allocator()->init(*out_0_info);
+ t_out_1.allocator()->init(*out_1_info);
+
+ // Allocate and fill user tensors
+    // Instead of using the ACL allocator, the user can choose to import memory into the tensors
+ t_in_0.allocator()->allocate();
+ t_in_1.allocator()->allocate();
+ t_in_2.allocator()->allocate();
+
+ t_out_0.allocator()->allocate();
+ t_out_1.allocator()->allocate();
+
+ fill<float>(CLAccessor(t_in_0), 0, library.get());
+ fill<float>(CLAccessor(t_in_1), 1, library.get());
+ fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+ // Run runtime
+ runtime.run({&t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1});
+
+ // Create reference
+ SimpleTensor<float> ref_t_in_0{t_input_shape, data_type, 1, QuantizationInfo()};
+ SimpleTensor<float> ref_t_in_1{t_input_shape, data_type, 1, QuantizationInfo()};
+ SimpleTensor<float> ref_t_in_2{t_input_shape, data_type, 1, QuantizationInfo()};
+
+ SimpleTensor<float> ref_t_out_0{t_input_shape, data_type, 1, QuantizationInfo()};
+ SimpleTensor<float> ref_t_ans_1{t_input_shape, data_type, 1, QuantizationInfo()};
+
+ // Fill reference
+ fill<float>(ref_t_in_0, 0, library.get());
+ fill<float>(ref_t_in_1, 1, library.get());
+ fill<float>(ref_t_in_2, 2, library.get());
+
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_ans_1,
+ ConvertPolicy::WRAP);
+ const auto ref_t_ans_2 =
+ reference::depth_convert<float, int32_t>(ref_t_ans_1, DataType::S32, ConvertPolicy::SATURATE, 0);
+ const auto ref_t_out_1 =
+ reference::depth_convert<int32_t, float>(ref_t_ans_2, DataType::F32, ConvertPolicy::SATURATE, 0);
+
+ RelativeTolerance<float> tolerance_add_f32(0.001f);
+ AbsoluteTolerance<float> tolerance_cast_f32(1.0f);
+ validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_add_f32);
+ validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
+}
+
+/// TODO: COMPMID-6593 : This integration test fails with the CKW backend.
+/// It was not enabled for CKW before, so the failure went unnoticed.
+TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::DISABLED)
+{
+ // (tensor0)
+ // |
+ // ======|============================================== Sketch 0
+ // | (tensor1) +---- (tensor2)
+ // | | | |
+ // +-- input -- weights -- biases --+ |
+ // | | |
+ // | Conv2d | |
+ // | | |
+ // +----------- output -------------+ |
+ // | |
+ // +-- input ---+ |
+ // | | |
+ // | Sigmoid | |
+ // | | |
+ // +-- output --+ |
+ // | |
+ // +-- input ---+ |
+ // | | |
+ // | Output | |
+ // | | |
+ // +-- output --+ |
+ // | |
+ // (tensor5) |
+ // | |
+ // +--------+ |
+ // ======|=============================|================ Sketch 1
+ // | (tensor3) (tensor4) |
+ // | | | |
+ // +-- input -- weights -- biases --+ |
+ // | | |
+ // | DepthwiseConv2d | |
+ // | | |
+ // +----------- output -------------+ |
+ // | |
+ // +--+ +----------------+
+ // | |
+ // +-- lhs -- rhs --+
+ // | |
+ // | Multiply |
+ // | |
+ // +---- output ----+
+ // |
+ // +-- input ---+
+ // | |
+ // | Output |
+ // | |
+ // +-- output --+
+ // |
+ // (tensor6)
+
+ TensorShape conv2d_src_shape(10, 20, 30);
+ TensorShape conv2d_wei_shape(10, 3, 3, 5);
+ TensorShape conv2d_bia_shape(5);
+ TensorShape conv2d_dst_shape(5, 18, 28);
+ TensorShape dwc_wei_shape(5, 3, 3);
+ TensorShape dwc_bia_shape(5);
+ TensorShape dwc_dst_shape(5, 16, 26);
+
+ // Initialize the context.
+ CLScheduler::get().default_reinit();
+
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context(&cl_compile_ctx);
+
+ auto tensor0_info = context.create_tensor_info(conv2d_src_shape, 1, DataType::F32, DataLayout::NHWC);
+
+    // Create the first sketch: conv2d + sigmoid + output.
+ GpuWorkloadSketch sketch0(&context);
+
+ Conv2dAttributes conv2d_attr;
+ auto tensor1_info = context.create_tensor_info(conv2d_wei_shape, 1, DataType::F32, DataLayout::NHWC);
+ auto tensor2_info = context.create_tensor_info(conv2d_bia_shape, 1, DataType::F32, DataLayout::NHWC);
+ ARM_COMPUTE_EXPECT(GpuConv2d::validate_op(sketch0, tensor0_info, tensor1_info, tensor2_info, conv2d_attr),
+ framework::LogLevel::ERRORS);
+ auto ans_info = GpuConv2d::create_op(sketch0, tensor0_info, tensor1_info, tensor2_info, conv2d_attr);
+
+ ARM_COMPUTE_EXPECT(GpuSigmoid::validate_op(sketch0, ans_info), framework::LogLevel::ERRORS);
+ ans_info = GpuSigmoid::create_op(sketch0, ans_info);
+
+ DepthwiseConv2dAttributes dwc_attr;
+ auto tensor3_info = context.create_tensor_info(dwc_wei_shape, 1, DataType::F32, DataLayout::NHWC);
+ auto tensor4_info = context.create_tensor_info(dwc_bia_shape, 1, DataType::F32, DataLayout::NHWC);
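+    // A second complex operator cannot be fused into the same sketch (see the
+    // Invalid_Fusion_Should_Fail suite below), so this validation is expected to fail.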
+ ARM_COMPUTE_EXPECT(!GpuDepthwiseConv2d::validate_op(sketch0, ans_info, tensor3_info, tensor4_info, dwc_attr),
+ framework::LogLevel::ERRORS);
+
+ auto tensor5_info = context.create_tensor_info();
+ ARM_COMPUTE_EXPECT(GpuOutput::validate_op(sketch0, ans_info, tensor5_info), framework::LogLevel::ERRORS);
+ GpuOutput::create_op(sketch0, ans_info, tensor5_info);
+
+ // Create the first workload runtime.
+ ClWorkloadRuntime runtime0;
+ runtime0.configure(sketch0);
+
+    // Create the second sketch: dwc + mul + output.
+ GpuWorkloadSketch sketch1(&context);
+
+ ARM_COMPUTE_EXPECT(GpuDepthwiseConv2d::validate_op(sketch1, tensor5_info, tensor3_info, tensor4_info, dwc_attr),
+ framework::LogLevel::ERRORS);
+ ans_info = GpuDepthwiseConv2d::create_op(sketch1, tensor5_info, tensor3_info, tensor4_info, dwc_attr);
+
+ ARM_COMPUTE_EXPECT(GpuMul::validate_op(sketch1, ans_info, tensor2_info), framework::LogLevel::ERRORS);
+ ans_info = GpuMul::create_op(sketch1, ans_info, tensor2_info);
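+    // Note: tensor2_info (the conv2d bias) is deliberately reused as the rhs of the multiplication.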
+
+ auto tensor6_info = context.create_tensor_info();
+ ARM_COMPUTE_EXPECT(GpuOutput::validate_op(sketch1, ans_info, tensor6_info), framework::LogLevel::ERRORS);
+ GpuOutput::create_op(sketch1, ans_info, tensor6_info);
+
+ // Create the second workload runtime.
+ ClWorkloadRuntime runtime1;
+ runtime1.configure(sketch1);
+
+ // Create the user tensors.
+ CLTensor tensor0;
+ CLTensor tensor1;
+ CLTensor tensor2;
+ CLTensor tensor3;
+ CLTensor tensor4;
+ CLTensor tensor5;
+ CLTensor tensor6;
+
+ tensor0.allocator()->init(*tensor0_info);
+ tensor1.allocator()->init(*tensor1_info);
+ tensor2.allocator()->init(*tensor2_info);
+ tensor3.allocator()->init(*tensor3_info);
+ tensor4.allocator()->init(*tensor4_info);
+ tensor5.allocator()->init(*tensor5_info);
+ tensor6.allocator()->init(*tensor6_info);
+
+ tensor0.allocator()->allocate();
+ tensor1.allocator()->allocate();
+ tensor2.allocator()->allocate();
+ tensor3.allocator()->allocate();
+ tensor4.allocator()->allocate();
+ tensor5.allocator()->allocate();
+ tensor6.allocator()->allocate();
+
+ // Allocate the auxiliary tensors.
+ for (auto &data : runtime0.get_auxiliary_tensors())
+ {
+ auto tensor = std::get<0>(data);
+ auto &tensor_info = std::get<1>(data);
+ auto mem_req = std::get<2>(data);
+
+ tensor->allocator()->init(tensor_info, mem_req.alignment);
+ tensor->allocator()->allocate();
+ }
+
+ for (auto &data : runtime1.get_auxiliary_tensors())
+ {
+ auto tensor = std::get<0>(data);
+ auto &tensor_info = std::get<1>(data);
+ auto mem_req = std::get<2>(data);
+
+ tensor->allocator()->init(tensor_info, mem_req.alignment);
+ tensor->allocator()->allocate();
+ }
+
+ // Fill the input tensors with random data.
+ fill<float>(CLAccessor(tensor0), 0, library.get());
+ fill<float>(CLAccessor(tensor1), 1, library.get());
+ fill<float>(CLAccessor(tensor2), 2, library.get());
+ fill<float>(CLAccessor(tensor3), 3, library.get());
+ fill<float>(CLAccessor(tensor4), 4, library.get());
+
+ // Run each runtime.
+ runtime0.run({&tensor0, &tensor1, &tensor2, &tensor5});
+ runtime1.run({&tensor5, &tensor3, &tensor4, &tensor2, &tensor6});
+
+ // Compute the reference result.
+ SimpleTensor<float> ref_conv2d_src(conv2d_src_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+ SimpleTensor<float> ref_conv2d_wei(conv2d_wei_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+ SimpleTensor<float> ref_conv2d_bia(conv2d_bia_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+ SimpleTensor<float> ref_dwc_wei(dwc_wei_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+ SimpleTensor<float> ref_dwc_bia(dwc_bia_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC);
+
+ fill<float>(ref_conv2d_src, 0, library.get());
+ fill<float>(ref_conv2d_wei, 1, library.get());
+ fill<float>(ref_conv2d_bia, 2, library.get());
+ fill<float>(ref_dwc_wei, 3, library.get());
+ fill<float>(ref_dwc_bia, 4, library.get());
+
+ PermutationVector nhwc_to_nchw(1, 2, 0);
+
+ auto conv2d_dst_shape_nchw = conv2d_dst_shape;
+ permute(conv2d_dst_shape_nchw, nhwc_to_nchw);
+ const auto ref_conv2d_src_nchw = reference::permute(ref_conv2d_src, nhwc_to_nchw);
+ const auto ref_conv2d_wei_nchw = reference::permute(ref_conv2d_wei, nhwc_to_nchw);
+ const auto ref_conv2d_bia_nchw = reference::permute(ref_conv2d_bia, nhwc_to_nchw);
+ const auto ref_conv2d_dst_nchw = reference::convolution_layer(
+ ref_conv2d_src_nchw, ref_conv2d_wei_nchw, ref_conv2d_bia_nchw, conv2d_dst_shape_nchw, PadStrideInfo());
+
+ const auto ref_sigmoid_dst_nchw = reference::activation_layer(
+ ref_conv2d_dst_nchw, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+
+ auto dwc_dst_shape_nchw = dwc_dst_shape;
+ permute(dwc_dst_shape_nchw, nhwc_to_nchw);
+ const auto ref_dwc_wei_nchw = reference::permute(ref_dwc_wei, nhwc_to_nchw);
+ const auto ref_dwc_bia_nchw = reference::permute(ref_dwc_bia, nhwc_to_nchw);
+ const auto ref_dwc_dst_nchw = reference::depthwise_convolution(
+ ref_sigmoid_dst_nchw, ref_dwc_wei_nchw, ref_dwc_bia_nchw, dwc_dst_shape_nchw, PadStrideInfo(), 1);
+
+ const auto ref_mul_dst_nchw = reference::pixel_wise_multiplication<float, float, float>(
+ ref_dwc_dst_nchw, ref_conv2d_bia_nchw, 1.0, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP,
+ DataType::F32);
+
+ constexpr RelativeTolerance<float> tolerance(0.001f);
+ validate(CLAccessor(tensor6), ref_mul_dst_nchw, tolerance);
+}
+
+TEST_SUITE(Invalid_Fusion_Should_Fail)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo();
+
+ Conv2dAttributes conv2d_attr{};
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ // Create tensor infos
+ ITensorInfo *input_info = context.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ ITensorInfo *weight_info = context.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ ITensorInfo *dst_info;
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator
+ const Status success = GpuConv2d::validate_op(sketch, input_info, weight_info, nullptr, conv2d_attr);
+ ARM_COMPUTE_EXPECT(bool(success), framework::LogLevel::ERRORS);
+
+ dst_info = GpuConv2d::create_op(sketch, input_info, weight_info, nullptr, conv2d_attr);
+ }
+
+ // Create tensor infos
+ ITensorInfo *weight_info_2 = context.create_tensor_info(t_weight_info);
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator, should fail
+ const Status success = GpuConv2d::validate_op(sketch, dst_info, weight_info_2, nullptr, conv2d_attr);
+ const auto expected_error_str = "Operator fusion test failed. This operator cannot be fused into the workload";
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT((success.error_description().find(expected_error_str) != std::string::npos),
+ framework::LogLevel::ERRORS);
+ }
+}
+TEST_SUITE_END() // Invalid_Fusion_Should_Fail
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
new file mode 100644
index 0000000000..9bfdc961fe
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/ArithmeticAddition.cpp from the standard interface.
+ *
+ * Difference | Why the difference
+ * No quantized tests | Not supported yet
+ * No in place tests | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(ADD)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+ framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // S16 is valid data type for Add
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // S32 is valid data type for Add
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
+                                                          TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), // Broadcast Y dimension is not allowed
+ TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::S16), // Broadcast Z dimension is not allowed
+ TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+ }),
+ framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+ TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
+                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs
+ TensorInfo(TensorShape(15U, 1U, 3U), 1, DataType::F32),
+ TensorInfo(TensorShape( 3U, 8U, 1U), 1, DataType::S16),
+ TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+ })),
+ framework::dataset::make("Expected", { true, false, true, true, false, true, false, false, true, false, false, true})),
+ input1_info, input2_info, expected)
+{
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Validate Elementwise Add
+ auto lhs_info = context.create_tensor_info(input1_info);
+ auto rhs_info = context.create_tensor_info(input2_info);
+
+ bool res = bool(GpuAdd::validate_op(sketch, lhs_info, rhs_info));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+constexpr AbsoluteTolerance<float> tolerance_f(
+ 0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 and DataType::F16 */
+constexpr float tolerance_num = 0.0001f; /**< Tolerance number */
+
+template <typename T>
+using DynamicFusionCLAddFixture =
+ DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+template <typename T>
+using DynamicFusionCLAddBroadcastFixture =
+ DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+template <typename T>
+using DynamicFusionCLAddTwoOpsFixture =
+ DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLAddFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+ DynamicFusionCLAddFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::LargeShapes()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLAddBroadcastFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::TemporaryLimitedSmallShapesBroadcast()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLAddBroadcastFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::TemporaryLimitedLargeShapesBroadcast()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(
+ RunSmallTwoOps,
+ DynamicFusionCLAddTwoOpsFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})),
+ framework::dataset::make("FuseTwoOps", {true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLAddFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLAddBroadcastFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::TemporaryLimitedSmallShapesBroadcast()),
+ framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f, tolerance_num);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLAddFixture<int32_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::S32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLAddFixture<int16_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::S16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionCLAddFixture<int16_t>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::LargeShapes()),
+ framework::dataset::make("DataType", {DataType::S16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLAddFixture<uint8_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::ADD}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::U8})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE_END() // ADD
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp b/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
new file mode 100644
index 0000000000..4ef359e74d
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ConvertPolicyDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+// Tolerance
+constexpr AbsoluteTolerance<float> zero_tolerance(0);
+
+/** Input data sets **/
+
+// F16
+const auto CastF16toF32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
+
+// F32
+const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+
+class DFConvertPolicies final : public framework::dataset::ContainerDataset<std::vector<ConvertPolicy>>
+{
+public:
+ DFConvertPolicies()
+ : ContainerDataset("ConvertPolicy",
+ {
+ ConvertPolicy::WRAP
+ })
+ {
+ }
+};
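+
+// A WRAP-only stand-in for the standard ConvertPolicies dataset: the dynamic
+// fusion cast tests exercise ConvertPolicy::WRAP only.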
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(CAST)
+
+template <typename T>
+using DynamicFusionCLCastToF16Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, half>;
+template <typename T>
+using DynamicFusionCLCastToF32Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, float>;
+
+#define CAST_SUITE(NAME, idt, odt, type, dataset, tolerance) \
+ TEST_SUITE(NAME) \
+ FIXTURE_DATA_TEST_CASE(RunSmall, type, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), dataset), \
+ DFConvertPolicies())) \
+ { \
+ validate(CLAccessor(_target), _reference, tolerance); \
+ } \
+ TEST_SUITE_END()
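+
+// Each CAST_SUITE invocation below instantiates a PRECOMMIT RunSmall test for one
+// (source, destination) type pair with the given fixture, dataset and tolerance.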
+
+// F16
+CAST_SUITE(F16_to_F32, DataType::F16, DataType::F32, DynamicFusionCLCastToF32Fixture<half>, CastF16toF32Dataset, zero_tolerance)
+
+// F32
+CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, DynamicFusionCLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
+
+TEST_SUITE_END() // CAST
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp b/tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp
new file mode 100644
index 0000000000..cef8b87c3f
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Clamp.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ClampFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr float epsilon = 1e-6f;
+constexpr AbsoluteTolerance<float> tolerance(epsilon);
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(CLAMP)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Minimum value larger than maximum value
+ }),
+ framework::dataset::make("MinVal", { 0.2f,
+ 1.5f,
+ 9.0f,
+ })),
+ framework::dataset::make("MaxVal", { 0.5f,
+ 2.0f,
+ 1.0f,
+ })),
+ framework::dataset::make("Expected", { true, true, false })),
+ input_info, min_val, max_val, expected)
+{
+ // Create a new workload sketch
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Fuse Clamp
+ const ITensorInfo* src_info = context.create_tensor_info(input_info);
+
+ ClampAttributes attributes {};
+ attributes.min_val(min_val)
+ .max_val(max_val);
+
+ const bool res = static_cast<bool>(GpuClamp::validate_op(sketch, src_info, attributes));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionClampOpFixture = DynamicFusionClampValidationFixture<CLTensor, CLAccessor, GpuClamp, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionClampOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallShapes(),
+ framework::dataset::make(
+ "ClampAttributes", {ClampAttributes().min_val(0.1f).max_val(0.6f)})),
+ framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+ DynamicFusionClampOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::Small5dShapes(),
+ framework::dataset::make(
+ "ClampAttributes", {ClampAttributes().min_val(0.1f).max_val(0.6f)})),
+ framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+ framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionClampOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallShapes(),
+ framework::dataset::make(
+ "ClampAttributes", {ClampAttributes().min_val(0.2f).max_val(0.4f)})),
+ framework::dataset::make("Fuse", {true})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionClampOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallShapes(),
+ framework::dataset::make(
+ "ClampAttributes", {ClampAttributes().min_val(0.3f).max_val(0.7f)})),
+ framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+ DynamicFusionClampOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::Small5dShapes(),
+ framework::dataset::make(
+ "ClampAttributes", {ClampAttributes().min_val(0.3f).max_val(0.7f)})),
+ framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+ framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionClampOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallShapes(),
+ framework::dataset::make(
+ "ClampAttributes", {ClampAttributes().min_val(0.1f).max_val(0.9f)})),
+ framework::dataset::make("Fuse", {true})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // CLAMP
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
new file mode 100644
index 0000000000..2f8c639cea
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DepthwiseConvolutionLayerDataset.h"
+#include "tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", {1U, 4U});
+const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", {1, 2, 5, 8});
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(DEPTHWISE_CONV2D)
+
+RelativeTolerance<float> tolerance_f32(
+ 0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(
+ 0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float tolerance_num = 0.02f; /**< Tolerance number */
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( // Explanations of failing tests
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching data type input/weights
+ TensorInfo(TensorShape(3U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching input feature maps
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching depth multiplier
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid biases size
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid biases dimensions
+ TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // dilation < 1
+ TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM16, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM8, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM16, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U8, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S8, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U16, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S16, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U32, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S32, DataLayout::NHWC), // Unsupported data type
+ TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::F32, DataLayout::NCHW), // Unsupported data layout
+ TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC), // weight dimension > 3
+ TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ }),
+ framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F16, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM8, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM16, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM8, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM16, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U8, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S8, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U16, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S16, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::F32, DataLayout::NCHW),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U, 5U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U, 4U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ })),
+ framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NCHW),
+ TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC),
+ })),
+ framework::dataset::make("Padding", { Padding2D(0, 0, 0, 0),
+ Padding2D(0, 0, 0, 0),
+ Padding2D(0, 0, 0, 0),
+ Padding2D(0, 0, 0, 0),
+ Padding2D(0, 0, 0, 0),
+ Padding2D(0, 0, 0, 0),
+ Padding2D(0, 0, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(1, 1, 0, 0),
+ Padding2D(2, 1, 2, 1),
+ Padding2D(2, 1, 2, 1),
+ Padding2D(2, 1, 2, 1),
+ })),
+ framework::dataset::make("Stride", { Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(1, 1),
+ Size2D(2, 3),
+ Size2D(2, 3),
+ })),
+ framework::dataset::make("DepthMultiplier", { 1,
+ 1,
+ 3,
+ 1,
+ 1,
+ 2,
+ 2,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ })),
+ framework::dataset::make("Dilation", { Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(0U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(2U, 3U),
+ })),
+ framework::dataset::make("Expected", { false, false, false, false, false, false, true, false,
+ false, false, false, false, false, false, false, false, false, false,
+ false, false, true, false, true, true, true })),
+ input_info, weights_info, biases_info, padding, stride, depth_multiplier, dilation, expected)
+{
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    GpuWorkloadContext context{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ const ITensorInfo* sketch_input_info = context.create_tensor_info(input_info);
+ const ITensorInfo* sketch_weights_info = context.create_tensor_info(weights_info);
+ const ITensorInfo* sketch_biases_info = context.create_tensor_info(biases_info);
+
+ DepthwiseConv2dAttributes attributes {};
+ attributes.pad(padding)
+ .stride(stride)
+ .dilation(dilation)
+ .depth_multiplier(depth_multiplier);
+
+ const Status status = GpuDepthwiseConv2d::validate_op(sketch, sketch_input_info, sketch_weights_info, sketch_biases_info, attributes);
+ const bool res = bool(status);
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionGpuDepthwiseConv2dFixture =
+ DynamicFusionGpuDepthwiseConv2dValidationFixture<CLTensor, CLAccessor, GpuDepthwiseConv2d, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+ depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+ depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // Generic
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(Dilation)
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+ depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeKernelSize,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(),
+ framework::dataset::make("DepthMultiplier", {1})),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(Dilation)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+ depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuDepthwiseConv2dFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+ large_depth_multipliers),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // Dilation
+TEST_SUITE_END() // Generic
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // DEPTHWISE_CONV2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
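For orientation, the pieces exercised by the Validate case above compose into a complete fused workload roughly as sketched below. This is an editorial illustration only: it assumes the GpuDepthwiseConv2d::create_op(), GpuOutput::create_op() and ClWorkloadRuntime interfaces used by the Integration tests in this series, and it elides tensor allocation, filling and execution.

// Editorial sketch: validate, record and compile a fused depthwise convolution
// (shapes taken from one of the Expected == true rows of the Validate case:
// 8 input channels, 3x3 weights, depth multiplier 2).
auto               cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
GpuWorkloadContext context{ &cl_compile_ctx };
GpuWorkloadSketch  sketch{ &context };

ITensorInfo *input_info   = context.create_tensor_info(TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC));
ITensorInfo *weights_info = context.create_tensor_info(TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC));
ITensorInfo *biases_info  = context.create_tensor_info(TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC));

DepthwiseConv2dAttributes attributes{};
attributes.pad(Padding2D()).stride(Size2D(1, 1)).dilation(Size2D(1U, 1U)).depth_multiplier(2);

if(bool(GpuDepthwiseConv2d::validate_op(sketch, input_info, weights_info, biases_info, attributes)))
{
    // Record the operator and bind its result to an output tensor info
    ITensorInfo *ans_info = GpuDepthwiseConv2d::create_op(sketch, input_info, weights_info, biases_info, attributes);
    GpuOutput::create_op(sketch, ans_info, context.create_tensor_info());

    // Compile the sketch; running it would additionally need allocated CLTensors,
    // e.g. runtime.run({ &t_input, &t_weights, &t_biases, &t_dst })
    ClWorkloadRuntime runtime;
    runtime.configure(sketch);
}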
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
new file mode 100644
index 0000000000..b843764786
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Tolerances from tests/validation/CL/DirectConvolutionLayer.cpp
+ */
+RelativeTolerance<float> tolerance_f32(
+ 0.05f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(
+ 0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float           abs_tolerance_f32(0.0001f); /**< Absolute tolerance for FP32 tests */
+constexpr float           tolerance_num = 0.07f;      /**< Maximum allowed ratio of mismatching elements */
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+/** Synced with tests/validation/CL/ConvolutionLayer.cpp
+ *
+ * Difference | Why the difference
+ * f32 tolerance here is smaller | To use the same tolerance as that of DirectConv2d; lowering tolerance is safe
+ * No quantized tests | Not supported yet
+ * No grouped CNN tests | Not supported yet
+ * No mixed layout tests | Not needed; only NHWC is supported
+ * No activation | Not needed in fusion
+ * No ValidateConvolutionMethod | Only a single method (direct conv2d) is supported
+ * No ReshapeWeights = true tests | Not applicable yet. This parameter only concerns gemm-based conv2d
+ * No RunSmallWithPadding tests | Padding is removed
+ *
+ */
+TEST_SUITE(CONV2D)
+
+template <typename T>
+using DynamicFusionGpuConv2dFixture = DynamicFusionGpuConv2dValidationFixture<CLTensor, CLAccessor, GpuConv2d, T>;
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuConv2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})),
+ framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuConv2dFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", {DataLayout::NHWC})),
+ framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // FP16
+
+// Tests for specific conv2d methods
+/** Synced with tests/validation/CL/DirectConvolutionLayer.cpp
+ *
+ * Difference | Why the difference
+ * No quantized tests | Not supported yet
+ * No Invalid output size test | Not applicable. Output is removed from the interface
+ * No mixed layout/NCHW tests | Not needed; only NHWC is supported
+ * No activation tests | Not needed in fusion
+ */
+TEST_SUITE(DIRECT_CONV2D)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid: Mismatching data type input/weights
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid: Mismatching input feature maps
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid weights dimensions
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Unsupported biases size
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Unsupported biases dimensions
+ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, DataLayout::NCHW), // Unsupported data layout: NCHW
+ TensorInfo(TensorShape(2U, 32U, 16U), 1, DataType::QASYMM8, DataLayout::NHWC), // Unsupported data type: quantized
+ TensorInfo(TensorShape(2U, 32U, 16U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Arbitrary weight sizes for NHWC are supported
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Non-rectangular weights dimensions for NHWC are supported
+ TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Strides > 2 for any kernel sizes for NHWC are supported
+ }),
+ framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F16, DataLayout::NHWC),
+ TensorInfo(TensorShape(3U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 4U, 3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32, DataLayout::NCHW),
+ TensorInfo(TensorShape(2U, 1U, 1U, 4U), 1, DataType::QASYMM8, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 1U, 1U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 13U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 5U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(2U, 3U, 3U, 4U), 1, DataType::F32, DataLayout::NHWC),
+ })),
+ framework::dataset::make("BiasesInfo",{ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U, 2U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(25U), 1, DataType::F32, DataLayout::NCHW),
+ TensorInfo(TensorShape(4U), 1, DataType::QASYMM8, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC),
+ })),
+ framework::dataset::make("Conv2dAttributes", {
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({1, 1}).pad({0, 0, 0, 0}),
+ Conv2dAttributes().stride({3, 3}).pad({0, 0, 0, 0}),
+ })),
+ framework::dataset::make("Expected", { false, false, false, false, false, false, false, true, true, true, true })),
+ input_info, weights_info, biases_info, conv2d_attrs, expected)
+{
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ const ITensorInfo* sketch_input_info = context.create_tensor_info(input_info);
+ const ITensorInfo* sketch_weights_info = context.create_tensor_info(weights_info);
+ const ITensorInfo* sketch_biases_info = context.create_tensor_info(biases_info);
+ bool is_valid = bool(GpuConv2d::validate_op(sketch, sketch_input_info, sketch_weights_info, sketch_biases_info, conv2d_attrs));
+ ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+template <typename T>
+using DynamicFusionGpuDirectConv2dFixture = DynamicFusionDirectConv2dValidationFixture<CLTensor, CLAccessor, GpuConv2d, T>;
+
+TEST_SUITE(FP16)
+/// TODO: COMPMID-6877: Once the issue in Conv2d is resolved, re-enable these tests
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDirectConv2dFixture<half>, framework::DatasetMode::DISABLED,
+ combine(combine(combine(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+ TensorShape(19U, 5U, 16U, 4U),
+ TensorShape(13U, 5U, 17U, 2U),
+ TensorShape(32U, 37U, 13U) } ),
+ framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
+ framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
+ framework::dataset::make("PadX", { 1, 3, 0, 4 })),
+ framework::dataset::make("PadY", { 1, 3, 0, 4 })),
+ framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
+ framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDirectConv2dFixture<half>, framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
+ framework::dataset::make("StrideX", { 1 })),
+ framework::dataset::make("StrideY", { 1 })),
+ framework::dataset::make("PadX", { 1 })),
+ framework::dataset::make("PadY", { 1 })),
+ framework::dataset::make("KernelSize", { 9 })),
+ framework::dataset::make("NumKernels", { 3 })),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+/// TODO: COMPMID-6877: Once the issue in Conv2d is resolved, re-enable these tests
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDirectConv2dFixture<float>, framework::DatasetMode::DISABLED,
+ combine(combine(combine(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputShape", { TensorShape(27U, 13U, 23U),
+ TensorShape(19U, 5U, 16U, 4U),
+ TensorShape(13U, 5U, 17U, 2U),
+ TensorShape(32U, 37U, 13U) } ),
+ framework::dataset::make("StrideX", { 1, 3, 1, 1 })),
+ framework::dataset::make("StrideY", { 1, 3, 2, 1 })),
+ framework::dataset::make("PadX", { 1, 3, 0, 4 })),
+ framework::dataset::make("PadY", { 1, 3, 0, 4 })),
+ framework::dataset::make("KernelSize", { 3, 8, 1, 9 })),
+ framework::dataset::make("NumKernels", { 17, 3, 1, 19 })),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.0, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDirectConv2dFixture<float>, framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ),
+ framework::dataset::make("StrideX", { 1 })),
+ framework::dataset::make("StrideY", { 1 })),
+ framework::dataset::make("PadX", { 1 })),
+ framework::dataset::make("PadY", { 1 })),
+ framework::dataset::make("KernelSize", { 9 })),
+ framework::dataset::make("NumKernels", { 3 })),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.0, abs_tolerance_f32);
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // DIRECT_CONV2D
+TEST_SUITE_END() // CONV2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
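As a cross-check for the StrideX/PadX/KernelSize combinations above: along each spatial dimension, the direct convolution output size follows the usual floor-division relation. A small editorial helper, not part of the patch:

// out = (in + pad_begin + pad_end - kernel) / stride + 1 (integer division).
// e.g. in = 27, kernel = 3, pads = 1 and 1, stride = 1 gives out = 27.
constexpr int conv_out_dim(int in, int kernel, int pad_begin, int pad_end, int stride)
{
    return (in + pad_begin + pad_end - kernel) / stride + 1;
}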
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
new file mode 100644
index 0000000000..82d66ca6ce
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/MatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/Validation.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(
+    0.001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr float abs_tolerance_f32(
+    0.0001f); /**< Absolute tolerance for DataType::F32, used where the relative tolerance fails because of small values */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(
+    0.02)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float abs_tolerance_f16(
+    0.001f); /**< Absolute tolerance for DataType::F16, used where the relative tolerance fails because of small values */
+} // namespace
+
+/** M0 values to test - precommit */
+const auto m0_values_lhs_nt_precommit = framework::dataset::make("M0", {1, 2, 3});
+
+/** N0 values to test - precommit */
+const auto n0_values_rhs_t_precommit = framework::dataset::make("N0", {1, 2, 4});
+
+/** K0 values to test - precommit */
+const auto k0_values_rhs_t_precommit = framework::dataset::make("K0", {1, 2, 4});
+
+/** M0 values to test - nightly */
+const auto m0_values_lhs_nt_nightly = framework::dataset::make("M0", {1, 2, 3, 4});
+
+/** N0 values to test - nightly */
+const auto n0_values_rhs_t_nightly = framework::dataset::make("N0", {1, 2, 3, 4, 8});
+
+/** K0 values to test - nightly */
+const auto k0_values_rhs_t_nightly = framework::dataset::make("K0", {1, 2, 3, 4, 8});
+
+class DFMatMulDataset final : public datasets::MatMulDataset
+{
+public:
+ DFMatMulDataset()
+ {
+ // LHS = [K, M], RHS = [N, K], DST = [N, M]
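+        // e.g. the third config below: LHS (9U, 6U) -> K = 9, M = 6; RHS (5U, 9U) -> N = 5, K = 9;
+        // DST (5U, 6U) -> N = 5, M = 6 (TensorShape lists the innermost dimension first)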
+ add_config(TensorShape(1U, 1U), TensorShape(1U, 1U), TensorShape(1U, 1U));
+ add_config(TensorShape(1U, 2U), TensorShape(2U, 1U), TensorShape(2U, 2U));
+ add_config(TensorShape(9U, 6U), TensorShape(5U, 9U), TensorShape(5U, 6U));
+ add_config(TensorShape(32U, 37U), TensorShape(17U, 32U), TensorShape(17U, 37U));
+ }
+};
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+
+TEST_SUITE(MatMul)
+
+TEST_SUITE(Validate)
+TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
+{
+ using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>;
+
+ const std::vector<MatMulConfigurationPair> supported_block_sizes = {
+ // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+
+ // Lhs not-transposed, Rhs transposed
+ {MatMulKernelInfo(false, true, 0, 1, 1), false}, // M0 should be > 0
+ {MatMulKernelInfo(false, true, 3, 11, 1), false}, // N0 not in {1, 2, 3, 4, 8, 16}
+ {MatMulKernelInfo(false, true, 3, 7, 1), false}, // N0 not in {1, 2, 3, 4, 8, 16}
+ {MatMulKernelInfo(false, true, 3, 3, 12), false}, // K0 not in {1, 2, 3, 4, 8, 16}
+ {MatMulKernelInfo(false, true, 3, 3, 6), false}, // K0 not in {1, 2, 3, 4, 8, 16}
+        {MatMulKernelInfo(false, true, 5, 1, 2), true},
+        {MatMulKernelInfo(false, true, 3, 3, 3), true},
+        {MatMulKernelInfo(false, true, 2, 4, 8), true},
+    };
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal
+ // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here,
+ // not the shapes themselves.
+ const ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+ const ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+
+ for (auto &pair : supported_block_sizes)
+ {
+ MatMulAttributes matmul_attr{};
+ matmul_attr.adj_lhs(pair.first.adj_lhs);
+ matmul_attr.adj_rhs(pair.first.adj_rhs);
+
+ GpuMatMulSettings matmul_settings{};
+ matmul_settings.m0(pair.first.m0);
+ matmul_settings.n0(pair.first.n0);
+ matmul_settings.k0(pair.first.k0);
+
+ Status status = GpuMatMul::validate_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+ ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+ }
+}
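+
+// Editorial illustration, not a library helper: for the Lhs-NT / Rhs-T kernel,
+// the block-size rules exercised above reduce to the following predicate
+// (M0 > 0; N0 and K0 in {1, 2, 3, 4, 8, 16}).
+inline bool is_supported_block_size_nt_t(int m0, int n0, int k0)
+{
+    const auto ok = [](int v) { return v == 1 || v == 2 || v == 3 || v == 4 || v == 8 || v == 16; };
+    return (m0 > 0) && ok(n0) && ok(k0);
+}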
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+ // Create a sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+    // Shapes are given in the Nt/Nt convention and are permuted inside the test to exercise the supported Nt/T configuration (only adj_lhs = false, adj_rhs = true below)
+ using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, bool>;
+ const std::vector<ShapeConfigurationTuple> shape_configurations = {
+ {TensorShape(5U, 1U), TensorShape(3U, 5U), true},
+ {TensorShape(10U, 12U), TensorShape(3U, 10U), true},
+ {TensorShape(8U, 4U), TensorShape(2U, 8U), true},
+ {TensorShape(8U, 4U), TensorShape(2U, 5U), false}, // Mismatch in the K dimension
+ {TensorShape(5U, 0U), TensorShape(2U, 5U), false}, // Invalid dimension
+ {TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), true},
+ {TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false}, // no batch broadcasting
+ {TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U),
+ false}, // mismatch in batch dimension
+ };
+
+ for (auto &tuple : shape_configurations)
+ {
+ const bool expected = std::get<2>(tuple);
+
+ for (bool adj_lhs : {false})
+ {
+ for (bool adj_rhs : {true})
+ {
+ TensorShape lhs_shape = std::get<0>(tuple);
+ TensorShape rhs_shape = std::get<1>(tuple);
+
+ if (adj_lhs)
+ {
+ permute(lhs_shape, PermutationVector(1U, 0U));
+ }
+
+ if (adj_rhs)
+ {
+ permute(rhs_shape, PermutationVector(1U, 0U));
+ }
+
+ const ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(lhs_shape, 1, DataType::F32));
+ const ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(rhs_shape, 1, DataType::F32));
+
+ MatMulAttributes matmul_attr{};
+ matmul_attr.adj_lhs(adj_lhs);
+ matmul_attr.adj_rhs(adj_rhs);
+
+ GpuMatMulSettings matmul_settings{};
+ matmul_settings.m0(1);
+ matmul_settings.n0(1);
+ matmul_settings.k0(1);
+
+ Status status = GpuMatMul::validate_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+ ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+ }
+ }
+ }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    // All configurations use the Nt/Nt layout; only the data type combinations are validated here
+ using DataTypeConfigurationTuple = std::tuple<DataType, DataType, DataType, bool>;
+ const std::vector<DataTypeConfigurationTuple> data_type_configurations = {
+ {DataType::F32, DataType::F32, DataType::F32, true},
+ {DataType::F16, DataType::F16, DataType::F16, true},
+ {DataType::F16, DataType::F32, DataType::F32, false}, // no mixed precision
+ {DataType::F64, DataType::F64, DataType::F64, false}, // no double precision
+ {DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false}, // no quantized types
+ {DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false}, // no quantized types
+ {DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL,
+ false}, // no quantized types
+ {DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false}, // no quantized types
+ {DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false}, // no quantized types
+ {DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false}, // no quantized types
+ {DataType::S64, DataType::S64, DataType::S64, false}, // no integral types
+ {DataType::S32, DataType::S32, DataType::S32, false}, // no integral types
+ {DataType::S16, DataType::S16, DataType::S16, false}, // no integral types
+ {DataType::S8, DataType::S8, DataType::S8, false}, // no integral types
+ {DataType::U64, DataType::U64, DataType::U64, false}, // no integral types
+ {DataType::U32, DataType::U32, DataType::U32, false}, // no integral types
+ {DataType::U16, DataType::U16, DataType::U16, false}, // no integral types
+ {DataType::U8, DataType::U8, DataType::U8, false}, // no integral types
+ };
+ // Create a sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ const TensorShape shape = TensorShape(10U, 10U);
+ MatMulAttributes matmul_attr{};
+ matmul_attr.adj_lhs(false);
+ matmul_attr.adj_rhs(false);
+ GpuMatMulSettings matmul_settings{};
+ matmul_settings.m0(1);
+ matmul_settings.n0(1);
+ matmul_settings.k0(1);
+
+ for (auto &tuple : data_type_configurations)
+ {
+ const bool expected = std::get<3>(tuple);
+
+ const ITensorInfo *lhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<0>(tuple)));
+ const ITensorInfo *rhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<1>(tuple)));
+
+ Status status = GpuMatMul::validate_op(sketch, lhs_info, rhs_info, matmul_attr, matmul_settings);
+ ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+ }
+}
+
+TEST_SUITE_END() // Validate
+
+template <typename T>
+using DynamicFusionGpuMatmulFixture = DynamicFusionGpuMatMulValidationFixture<CLTensor, CLAccessor, GpuMatMul, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit,
+ DynamicFusionGpuMatmulFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(DFMatMulDataset(),
+ framework::dataset::make("TransposeA", {false}),
+ framework::dataset::make("TransposeB", {true}),
+ m0_values_lhs_nt_precommit,
+ n0_values_rhs_t_precommit,
+ k0_values_rhs_t_precommit,
+ framework::dataset::make("ExportRhsToCLImage", {false}),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly,
+ DynamicFusionGpuMatmulFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(DFMatMulDataset(),
+ framework::dataset::make("TransposeA", {false}),
+ framework::dataset::make("TransposeB", {true}),
+ m0_values_lhs_nt_nightly,
+ n0_values_rhs_t_nightly,
+ k0_values_rhs_t_nightly,
+ framework::dataset::make("ExportRhsToCLImage", {false}),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit,
+ DynamicFusionGpuMatmulFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(DFMatMulDataset(),
+ framework::dataset::make("TransposeA", {false}),
+ framework::dataset::make("TransposeB", {true}),
+ m0_values_lhs_nt_precommit,
+ n0_values_rhs_t_precommit,
+ k0_values_rhs_t_precommit,
+ framework::dataset::make("ExportRhsToCLImage", {false}),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly,
+ DynamicFusionGpuMatmulFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(DFMatMulDataset(),
+ framework::dataset::make("TransposeA", {false}),
+ framework::dataset::make("TransposeB", {true}),
+ m0_values_lhs_nt_nightly,
+ n0_values_rhs_t_nightly,
+ k0_values_rhs_t_nightly,
+ framework::dataset::make("ExportRhsToCLImage", {false}),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
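For the fixtures above (TransposeA = false, TransposeB = true), the destination shape follows directly from the LHS = [K, M], RHS = [N, K], DST = [N, M] convention noted in DFMatMulDataset. An editorial sketch:

// DST = (N, M): N is the innermost dimension of the RHS (N, K) and M the
// second dimension of the LHS (K, M); TensorShape lists innermost first.
TensorShape matmul_dst_shape(const TensorShape &lhs, const TensorShape &rhs)
{
    return TensorShape(rhs.x(), lhs.y());
}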
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
new file mode 100644
index 0000000000..af02ce3eaa
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/PixelwiseMultiplication.cpp from the standard interface.
+ *
+ * Difference | Why the difference
+ * No integer tests | Not supported yet
+ * No quantized tests | Not supported yet
+ * No convert policy tests | Not needed, as the convert policy is ignored for floating-point types
+ * No scale tests | Not supported yet
+ * No rounding modes tests | Not supported yet
+ * No in place tests | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f16(
+ 0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> tolerance_f32(
+ 0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+} // namespace
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(MUL)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+ framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Unsupported data type U8
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Unsupported data type S8
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // Unsupported data type S16
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // Unsupported data type S32
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), // Broadcast Y dimension is not allowed
+ TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::F32), // Broadcast Z dimension is not allowed
+ TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+ }),
+ framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+ TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs
+ TensorInfo(TensorShape(15U, 1U, 3U), 1, DataType::F32),
+ TensorInfo(TensorShape( 3U, 8U, 1U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+ })),
+ framework::dataset::make("Expected", { true, true, false, false, false, false, false, false, false, false, true, true, false, false, true })),
+ input1_info, input2_info, expected)
+{
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Validate Elementwise Mul
+ auto lhs_info = context.create_tensor_info(input1_info);
+ auto rhs_info = context.create_tensor_info(input2_info);
+
+ bool res = bool(GpuMul::validate_op(sketch, lhs_info, rhs_info));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionCLMulFixture = DynamicFusionMulOneOpValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+template <typename T>
+using DynamicFusionCLMulBroadcastFixture = DynamicFusionMulBroadcastValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+template <typename T>
+using DynamicFusionCLMulTwoOpsFixture = DynamicFusionMulTwoOpsValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLMulFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<half>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::TemporaryLimitedSmallShapesBroadcast(),
+ framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::TemporaryLimitedLargeShapesBroadcast(),
+ framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // F16
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLMulFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+ DynamicFusionCLMulFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::TemporaryLimitedSmallShapesBroadcast(),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::TemporaryLimitedLargeShapesBroadcast(),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionCLMulTwoOpsFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes(),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})),
+ framework::dataset::make("FuseTwoOps", {true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // F32
+
+TEST_SUITE_END() // MUL
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
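The TwoOps fixture exercised above records two multiplications into a single sketch. Conceptually, the recording chains the intermediate tensor info as below; this is an editorial illustration assuming GpuMul::create_op() returns the intermediate ITensorInfo*, in line with the other dynamic fusion operators.

// Editorial sketch of two fused multiplications: dst = (lhs * rhs0) * rhs1
auto               cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
GpuWorkloadContext context{ &cl_compile_ctx };
GpuWorkloadSketch  sketch{ &context };

ITensorInfo *lhs_info  = context.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));
ITensorInfo *rhs0_info = context.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));
ITensorInfo *rhs1_info = context.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));

ITensorInfo *ans0_info = GpuMul::create_op(sketch, lhs_info, rhs0_info);  // first mul
ITensorInfo *ans1_info = GpuMul::create_op(sketch, ans0_info, rhs1_info); // second mul, fused
GpuOutput::create_op(sketch, ans1_info, context.create_tensor_info());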
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
new file mode 100644
index 0000000000..be816b32b3
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/dynamic_fusion/PoolingLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(POOL2D)
+
+constexpr AbsoluteTolerance<float> tolerance_f32(
+ 0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float> tolerance_f16(
+ 0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+
+const auto PoolingLayerDatasetFP =
+ combine(combine(combine(combine(framework::dataset::make("PoolingType", {PoolingType::MAX, PoolingType::AVG}),
+ framework::dataset::make("PoolingSize", {Size2D(2, 2), Size2D(3, 3)})),
+ framework::dataset::make("Pad", {Padding2D()})),
+ framework::dataset::make("Stride", {Size2D(1, 1), Size2D(2, 1), Size2D(5, 7)})),
+ framework::dataset::make("ExcludePadding", {true}));
+
+template <typename T>
+using DynamicFusionGpuPool2dFixture = DynamicFusionGpuPool2dValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+
+template <typename T>
+using DFSpecialGpuPool2dFixture = DynamicFusionGpuPool2dSpecialValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+// *INDENT-OFF*
+// clang-format off
+
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid parameters, unsupported pooling
+ TensorInfo(TensorShape(5U, 15U, 13U), 1, DataType::F32, DataLayout::NHWC), // Valid Non-rectangular Global Pooling
+ TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid - Quantized not supported.
+ TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC), // Valid global pooling
+ TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32, DataLayout::NCHW), // Unsupported data layout
+ }),
+ framework::dataset::make("Pool2dAttributes", {
+ Pool2dAttributes().pool_type(PoolingType::L2).pool_size(Size2D(3,3)).pad(Padding2D(0,0,0,0)).stride(Size2D(1,1)),
+ Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(15U, 13U)),
+ Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D()).stride(Size2D(1,1)),
+ Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)),
+ Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)),
+ })),
+ framework::dataset::make("Expected", { false, true, false, true, false })),
+ input_info, pool2d_attr, expected)
+{
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+    // Declare GpuPool2d settings
+    const GpuPool2dSettings settings{};
+
+ // Validate Pool2d Configuration
+ auto src_info = context.create_tensor_info(input_info);
+ bool res = bool(GpuPool2d::validate_op(sketch, src_info, pool2d_attr, settings));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuPool2dFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::SmallNoneUnitShapes(), PoolingLayerDatasetFP),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionGpuPool2dFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::LargeShapes(), PoolingLayerDatasetFP),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSpecial,
+ DFSpecialGpuPool2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(datasets::PoolingLayerDatasetSpecialDynamicFusion(),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+FIXTURE_DATA_TEST_CASE(
+ RunSmall,
+ DynamicFusionGpuPool2dFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+ {TensorShape(27U, 13U, 2U),
+ TensorShape(27U, 13U, 2U, 4U)}),
+ framework::dataset::make("PoolingType",
+ {PoolingType::AVG, PoolingType::MAX})),
+ framework::dataset::make("PoolingSize", {Size2D(27, 13)})),
+ framework::dataset::make("Pad", {Padding2D()})),
+ framework::dataset::make("Stride", {Size2D(1, 1)})),
+ framework::dataset::make("ExcludePadding", true)),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(
+ RunLarge,
+ DynamicFusionGpuPool2dFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+ {TensorShape(79U, 37U, 11U),
+ TensorShape(79U, 37U, 11U, 4U)}),
+ framework::dataset::make("PoolingType",
+ {PoolingType::AVG, PoolingType::MAX})),
+ framework::dataset::make("PoolingSize", {Size2D(79, 37)})),
+ framework::dataset::make("Pad", {Padding2D()})),
+ framework::dataset::make("Stride", {Size2D(1, 1)})),
+ framework::dataset::make("ExcludePadding", true)),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+TEST_SUITE(GlobalPooling)
+FIXTURE_DATA_TEST_CASE(
+ RunSmall,
+ DynamicFusionGpuPool2dFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+ {TensorShape(27U, 13U, 2U),
+ TensorShape(27U, 13U, 2U, 4U)}),
+ framework::dataset::make("PoolingType",
+ {PoolingType::AVG, PoolingType::MAX})),
+ framework::dataset::make("PoolingSize", {Size2D(27, 13)})),
+ framework::dataset::make("Pad", {Padding2D()})),
+ framework::dataset::make("Stride", {Size2D(1, 1)})),
+ framework::dataset::make("ExcludePadding", true)),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(
+ RunLarge,
+ DynamicFusionGpuPool2dFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(combine(combine(combine(framework::dataset::make("InputShape",
+ {TensorShape(79U, 37U, 11U),
+ TensorShape(79U, 37U, 11U, 4U)}),
+ framework::dataset::make("PoolingType",
+ {PoolingType::AVG, PoolingType::MAX})),
+ framework::dataset::make("PoolingSize", {Size2D(79, 37)})),
+ framework::dataset::make("Pad", {Padding2D()})),
+ framework::dataset::make("Stride", {Size2D(1, 1)})),
+ framework::dataset::make("ExcludePadding", true)),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // POOL2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
new file mode 100644
index 0000000000..d46754ccca
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ReshapeLayerDataset.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ReshapeFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(RESHAPE)
+
+DATA_TEST_CASE(Validate,
+ framework::DatasetMode::DISABLED,
+ zip(zip(framework::dataset::make(
+ "InputInfo",
+ {
+ TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),
+ TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
+                           TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32) /* mismatching dimensions */,
+ }),
+ framework::dataset::make("OutputShape",
+ {
+ TensorShape(9U, 5U, 21U),
+ TensorShape(8U, 24U, 4U),
+ TensorShape(192U, 192U),
+ })),
+ framework::dataset::make("Expected", {true, true, false})),
+ input_info,
+ output_shape,
+ expected)
+{
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ // Create sketch tensors
+ TensorShape input_shape = input_info.tensor_shape();
+ ARM_COMPUTE_UNUSED(input_shape);
+ ITensorInfo *src_info = context.create_tensor_info(input_info);
+
+ ReshapeAttributes attributes;
+ attributes.shape(output_shape);
+ Status status = GpuReshape::validate_op(sketch, src_info, attributes);
+ ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+
+template <typename T>
+using DynamicFusionGpuReshapeLayerFixture =
+ DynamicFusionGpuReshapeLayerValidationFixture<CLTensor, CLAccessor, GpuReshape, T>;
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuReshapeLayerFixture<float>,
+ framework::DatasetMode::DISABLED,
+ combine(datasets::SmallReshapeLayerDataset(),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // F32
+
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuReshapeLayerFixture<half>,
+ framework::DatasetMode::DISABLED,
+ combine(datasets::SmallReshapeLayerDataset(),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // F16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuReshapeLayerFixture<uint8_t>,
+ framework::DatasetMode::DISABLED,
+ combine(datasets::SmallReshapeLayerDataset(),
+ framework::dataset::make("DataType", DataType::U8)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuReshapeLayerFixture<int8_t>,
+ framework::DatasetMode::DISABLED,
+ combine(datasets::SmallReshapeLayerDataset(),
+ framework::dataset::make("DataType", DataType::S8)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S8
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionGpuReshapeLayerFixture<int16_t>,
+ framework::DatasetMode::DISABLED,
+ combine(datasets::SmallReshapeLayerDataset(),
+ framework::dataset::make("DataType", DataType::S16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE_END() // RESHAPE
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
new file mode 100644
index 0000000000..a6bcf4ae26
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ScaleValidationDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ResizeFixture.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+using datasets::ScaleAlignCornersSamplingPolicySet;
+using datasets::ScaleInterpolationPolicySet;
+using datasets::ScaleSamplingPolicySet;
+using datasets::ScaleShapesBaseDataSet;
+
+/** We use a vector size of 16 bytes, since the maximum size of
+ * a vector used by @ref CLScaleKernel is currently 16 bytes (float4).
+ */
+constexpr uint32_t vector_byte = 16;
+
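+// For example, float: 16 / sizeof(float) = 4 elements per vector; half: 16 / sizeof(half) = 8 elements per vector.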
+template <typename T>
+constexpr uint32_t num_elements_per_vector()
+{
+ return vector_byte / sizeof(T);
+}
+
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
+ {
+ QuantizationInfo(0.5f, -1),
+ });
+
+/** Tolerance */
+constexpr float tolerance_f32_absolute(0.001f);
+
+RelativeTolerance<float> tolerance_f32(0.05);
+constexpr float abs_tolerance_f16(0.1f);
+RelativeTolerance<half> tolerance_f16(half(0.1));
+
+constexpr float tolerance_num_f32(0.01f);
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(RESIZE)
+
+TEST_SUITE(Validate)
+
+const auto default_input_shape = TensorShape{2, 3, 3, 2};
+const auto default_output_shape = TensorShape{4, 6, 3, 2};
+
+constexpr auto default_data_type = DataType::U8;
+constexpr auto default_data_layout = DataLayout::NHWC;
+
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
+{
+ const TensorInfo input_info = TensorInfo{default_input_shape, 1, default_data_type, default_data_layout};
+ const TensorInfo output_info = TensorInfo{default_output_shape, 1, default_data_type, default_data_layout};
+
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ // nullptr is given as input
+ Status status = GpuResize::validate_op(sketch, nullptr, ResizeAttributes());
+ ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
+{
+ const std::map<DataType, bool> supported_data_types =
+ {
+ { DataType::U8, false },
+ { DataType::S8, false },
+ { DataType::QSYMM8, false },
+ { DataType::QASYMM8, false },
+ { DataType::QASYMM8_SIGNED, false },
+ { DataType::QSYMM8_PER_CHANNEL, false },
+ { DataType::U16, false },
+ { DataType::S16, false },
+ { DataType::QSYMM16, false },
+ { DataType::QASYMM16, false },
+ { DataType::U32, false },
+ { DataType::S32, false },
+ { DataType::U64, false },
+ { DataType::S64, false },
+ { DataType::BFLOAT16, false },
+ { DataType::F16, true },
+ { DataType::F32, true },
+ { DataType::F64, false },
+ { DataType::SIZET, false },
+ };
+
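+    // Per the map above, GpuResize is expected to accept only F16 and F32; every other data type must fail validation.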
+ for (auto &kv : supported_data_types)
+ {
+ const TensorInfo input_info = TensorInfo{default_input_shape, 1, kv.first, default_data_layout};
+
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+ ResizeAttributes attributes;
+        attributes.output_width(default_output_shape[0]); // the exact output shape is not important, as long as it is non-empty
+ attributes.output_height(default_output_shape[1]);
+
+ Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+ ARM_COMPUTE_EXPECT(bool(status) == kv.second, framework::LogLevel::ERRORS);
+ }
+}
+
+TEST_CASE(MismatchingDataType, framework::DatasetMode::ALL)
+{
+ constexpr DataType non_default_data_type = DataType::F32;
+
+ const TensorInfo input_info = TensorInfo{default_input_shape, 1, default_data_type, default_data_layout};
+ const TensorInfo output_info = TensorInfo{default_output_shape, 1, non_default_data_type, default_data_layout};
+
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+ Status status = GpuResize::validate_op(sketch, sketch_input_info, ResizeAttributes());
+ ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
+{
+    // Aligned corners require the sampling policy to be TOP_LEFT.
+ constexpr InterpolationPolicy interpolation_policy = InterpolationPolicy::BILINEAR;
+ constexpr bool align_corners = true;
+ constexpr SamplingPolicy sampling_policy = SamplingPolicy::CENTER;
+
+ const TensorInfo input_info = TensorInfo{default_input_shape, 1, default_data_type, default_data_layout};
+ const TensorInfo output_info = TensorInfo{default_output_shape, 1, default_data_type, default_data_layout};
+
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+ ResizeAttributes attributes{};
+ attributes.interpolation_policy(interpolation_policy).sampling_policy(sampling_policy).align_corners(align_corners);
+
+ Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+ ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(UnsupportedInterpolationPolicy, framework::DatasetMode::ALL)
+{
+ const TensorInfo input_info = TensorInfo{TensorShape(28U, 33U, 2U), 1, DataType::F32, default_data_layout};
+ const TensorInfo output_info = TensorInfo{TensorShape(26U, 21U, 2U), 1, DataType::F32, default_data_layout};
+ constexpr auto interpolation_policy = InterpolationPolicy::AREA;
+
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+ ResizeAttributes attributes{};
+ attributes.interpolation_policy(interpolation_policy);
+
+ Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+ ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(UnsupportedLayout, framework::DatasetMode::ALL)
+{
+ const TensorInfo input_info = TensorInfo{default_input_shape, 1, default_data_type, DataLayout::NCHW};
+ const TensorInfo output_info = TensorInfo{default_output_shape, 1, default_data_type, DataLayout::NCHW};
+ constexpr auto interpolation_policy = InterpolationPolicy::BILINEAR;
+
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{&cl_compile_ctx};
+ GpuWorkloadSketch sketch{&context};
+
+ const ITensorInfo *sketch_input_info = context.create_tensor_info(input_info);
+
+ ResizeAttributes attributes{};
+ attributes.interpolation_policy(interpolation_policy);
+
+ Status status = GpuResize::validate_op(sketch, sketch_input_info, attributes);
+ ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END() // Validate
+
+template <typename T>
+using DynamicFusionResizeFixture = DynamicFusionResizeValidationFixture<CLTensor, CLAccessor, GpuResize, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+const auto f32_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<float>())),
+ framework::dataset::make("DataType", DataType::F32));
+
+FIXTURE_DATA_TEST_CASE(Run,
+ DynamicFusionResizeFixture<float>,
+ framework::DatasetMode::ALL,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_shape, ScaleSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+
+FIXTURE_DATA_TEST_CASE(RunAlignCorners,
+ DynamicFusionResizeFixture<float>,
+ framework::DatasetMode::ALL,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+const auto f32_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<float>())),
+ framework::dataset::make("DataType", DataType::F32));
+FIXTURE_DATA_TEST_CASE(RunNightly,
+ DynamicFusionResizeFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_nightly_shape, ScaleSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
+ DynamicFusionResizeFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f32_nightly_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+const auto f16_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<half>())),
+ framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(Run,
+ DynamicFusionResizeFixture<half>,
+ framework::DatasetMode::ALL,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_shape, ScaleSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunAlignCorners,
+ DynamicFusionResizeFixture<half>,
+ framework::DatasetMode::ALL,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+const auto f16_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<half>())),
+ framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(RunNightly,
+ DynamicFusionResizeFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_nightly_shape, ScaleSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
+ DynamicFusionResizeFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ ASSEMBLE_DATASET_DYNAMIC_FUSION(f16_nightly_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    // Create valid region
+ TensorInfo src_info(_shape, 1, _data_type);
+ const ValidRegion valid_region =
+ calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
+
+ // Validate output
+ validate(CLAccessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // RESIZE
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp
new file mode 100644
index 0000000000..0134a7c11b
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Sigmoid.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f32(1e-6f);
+constexpr AbsoluteTolerance<float> tolerance_f16(0.001f);
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SIGMOID)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type
+ }),
+ framework::dataset::make("Expected", { true, true, false })),
+ input_info, expected)
+{
+ // Create a new workload sketch
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Fuse sigmoid
+ const ITensorInfo *src_info = context.create_tensor_info(input_info);
+
+ const bool res = static_cast<bool>(GpuSigmoid::validate_op(sketch, src_info));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionSigmoidOpFixture = DynamicFusionSigmoidValidationFixture<CLTensor, CLAccessor, GpuSigmoid, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionSigmoidOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+ DynamicFusionSigmoidOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+ framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionSigmoidOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionSigmoidOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+ DynamicFusionSigmoidOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+ framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionSigmoidOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // SIGMOID
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
new file mode 100644
index 0000000000..8f5a1ed14a
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/SoftmaxFixture.h"
+#include "tests/validation/Validation.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Tolerance for float operations */
+RelativeTolerance<half> tolerance_f16(half(0.2));
+RelativeTolerance<float> tolerance_f32(0.001f);
+
+using framework::dataset::make;
+
+/// TODO: COMPMID-6713
+/// Softmax is not yet implemented in CKW, so these tests are DISABLED.
+/// Re-enable them once Softmax is implemented in CKW.
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SOFTMAX)
+
+// *INDENT-OFF*
+// clang-format off
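+// Note: only softmax along axis 0 is supported; the negative and non-zero axis values below are expected to fail validation.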
+DATA_TEST_CASE(Validate, framework::DatasetMode::DISABLED,
+ zip(
+ make("InputInfo", {
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::S32), // Unsupported data type
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ }),
+ make("OutputInfo",{
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM16), // Unsupported data type
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ }),
+ make("beta", {
+ 1.0,
+ 2.0,
+ 2.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ }),
+ make("axis", {
+ 0,
+ 0,
+ 1, // Invalid as axis != 0
+ 0,
+ 0,
+ 0,
+ -3, // Invalid as axis != 0
+ 2, // Invalid as axis != 0
+ 1, // Invalid as axis != 0
+ -1, // Invalid as axis != 0
+ }),
+ make("Expected", { false, false, false, true, false, false, false, false, false, false})),
+ input_info, output_info, beta, axis, expected)
+{
+ // Create a new workload sketch
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ SoftmaxAttributes softmax_attr{};
+ softmax_attr.axis(axis).beta(beta).is_log_softmax(false);
+ ITensorInfo* src_info = context.create_tensor_info(input_info);
+ ITensorInfo* dst_info = context.create_tensor_info(output_info);
+ const bool res = static_cast<bool>(GpuSoftmax::validate_op(sketch, src_info, dst_info, softmax_attr));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+template <typename T>
+using DynamicFusionSoftmaxLayerFixture = DynamicFusionSoftmaxValidationFixture<CLTensor, CLAccessor, GpuSoftmax, T>;
+
+TEST_SUITE(FLOAT)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayer4DShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayer4DShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // FLOAT
+
+TEST_SUITE_END() // SOFTMAX
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
new file mode 100644
index 0000000000..c7ab1e717c
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/ArithmeticSubtraction.cpp from the standard interface.
+ *
+ * Difference          | Why the difference
+ * No quantized tests  | Not supported yet
+ * No in-place tests   | Not supported yet
+ * No activation tests | Not needed in the dynamic fusion interface
+ */
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SUB)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+ framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U32), // Unsupported data type U32
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
+                                           TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // Invalid data type combination
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // Invalid data type combination
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), // Broadcast Y dimension is not allowed
+ TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::S16), // Invalid data type combination
+                                           TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32), // Batching is allowed
+ }),
+ framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+ TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs
+ TensorInfo(TensorShape(15U, 1U, 3U), 1, DataType::F32),
+ TensorInfo(TensorShape( 3U, 8U, 1U), 1, DataType::S16),
+                                           TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
+ })),
+ framework::dataset::make("Expected", { true, false, false, false, false, false, false, false, true, true, false, false, true })),
+ input1_info, input2_info, expected)
+{
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Validate Elementwise Sub
+ auto lhs_info = context.create_tensor_info(input1_info);
+ auto rhs_info = context.create_tensor_info(input2_info);
+
+ bool res = bool(GpuSub::validate_op(sketch, lhs_info, rhs_info));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionCLSubFixture =
+ DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+template <typename T>
+using DynamicFusionCLSubBroadcastFixture =
+ DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+template <typename T>
+using DynamicFusionCLSubTwoOpsFixture =
+ DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLSubFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+ DynamicFusionCLSubFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::LargeShapes()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLSubBroadcastFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::TemporaryLimitedSmallShapesBroadcast()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLSubBroadcastFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::TemporaryLimitedLargeShapesBroadcast()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(
+ RunSmallTwoOps,
+ DynamicFusionCLSubTwoOpsFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+ framework::dataset::make("DataType", {DataType::F32})),
+ framework::dataset::make("InPlace", {false})),
+ framework::dataset::make("FuseTwoOps", {true})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLSubFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLSubBroadcastFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::TemporaryLimitedSmallShapesBroadcast()),
+ framework::dataset::make("DataType", {DataType::F16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLSubFixture<int32_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::S32})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLSubFixture<int16_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::S16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionCLSubFixture<int16_t>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::LargeShapes()),
+ framework::dataset::make("DataType", {DataType::S16})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLSubFixture<uint8_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", {ArithmeticOperation::SUB}),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", {DataType::U8})),
+ framework::dataset::make("InPlace", {false})))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE_END() // SUB
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp b/tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp
new file mode 100644
index 0000000000..2560f3aab1
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Tanh.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/ActivationFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f32(0.00001f);
+constexpr AbsoluteTolerance<float> tolerance_f16(0.001f);
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(TANH)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type
+ }),
+ framework::dataset::make("Expected", { true, true, false })),
+ input_info, expected)
+{
+ // Create a new workload sketch
+ CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ GpuWorkloadContext context{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Fuse tanh
+ const ITensorInfo* src_info = context.create_tensor_info(input_info);
+
+ const bool res = static_cast<bool>(GpuTanh::validate_op(sketch, src_info));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionTanhOpFixture = DynamicFusionTanhValidationFixture<CLTensor, CLAccessor, GpuTanh, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionTanhOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+ DynamicFusionTanhOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+ framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionTanhOpFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionTanhOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall5dOneOp,
+ DynamicFusionTanhOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::Small5dShapes(), framework::dataset::make("Fuse", {false})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ ARM_COMPUTE_TEST_INFO("Currently 5D+ tensors are unsupported for this operation.");
+ framework::ARM_COMPUTE_PRINT_INFO();
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionTanhOpFixture<float>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(), framework::dataset::make("Fuse", {true})),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // TANH
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute