 Android.bp                                                                       |   1
 arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h                         |   4
 arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h                         |  95
 filelist.json                                                                    |   3
 src/core/CL/cl_kernels/tile_helpers.h                                            |   5
 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp     |   3
 src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp                               |  22
 src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp                               |  75
 src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h    |  20
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp |  29
 tests/validation/dynamic_fusion/gpu/cl/Add.cpp                                   | 157
 tests/validation/dynamic_fusion/gpu/cl/Mul.cpp                                   | 223
 tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h       |  41
 tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h                  | 235
 utils/TypePrinter.h                                                              |  84
 15 files changed, 843 insertions(+), 154 deletions(-)
diff --git a/Android.bp b/Android.bp
index 34f722f6fe..78aa64a5f9 100644
--- a/Android.bp
+++ b/Android.bp
@@ -619,6 +619,7 @@ cc_library_static {
"src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
index 6ac5d4e500..796fd6f83a 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
@@ -25,10 +25,12 @@
#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensorInfo.h"
namespace arm_compute
{
+/** Forward declaration */
+class ITensorInfo;
+
namespace experimental
{
namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
new file mode 100644
index 0000000000..3e0ebdd96c
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMUL
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMUL
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuMul final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:-------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Pointer to the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs);
+
+ /** Validate the operator and check if the configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuMul::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMUL */
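For orientation, a minimal end-to-end usage sketch of this interface, assuming the CL backend setup used by the test fixtures later in this patch (tensor shapes and names are illustrative):

    // Build a sketch and fuse a single Mul into it (mirrors the fixtures below)
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    TensorInfo lhs_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));
    TensorInfo rhs_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));
    TensorInfo dst_info = sketch.create_tensor_info();

    ITensorInfo *ans_info = GpuMul::create_op(sketch, &lhs_info, &rhs_info);
    GpuOutput::create_op(sketch, ans_info, &dst_info); // mark the result as a workload output

    ClWorkloadRuntime runtime;
    runtime.configure(sketch);
    // ... initialise and allocate CLTensors from the TensorInfos, then:
    // runtime.run({ &t_lhs, &t_rhs, &t_dst });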
diff --git a/filelist.json b/filelist.json
index 3cb3a7a76f..a66d2a3384 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2234,13 +2234,14 @@
"src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp",
"src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
- "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp",
"src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
"src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
"src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 507e172dfb..baee0d561b 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -1063,6 +1063,7 @@
#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
+#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
/** Element-wise activation for quantized types
*
@@ -1130,6 +1131,9 @@
#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
/** Element-wise scale with a constant value
*
* @note Performs: LHS * constant = DST
@@ -1193,6 +1197,7 @@
#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
/** Element-wise operation between two tiles (LHS and RHS)
*
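For reference, a rough sketch of how the new multiply variants are meant to be invoked from a tiled kernel body, by analogy with the existing ADD/DIV variants (the tile names lhs/rhs/dst are illustrative):

    // dst = lhs * rhs over an M0 x N0 tile
    T_ELTWISE_MUL(DATA_TYPE, M0, N0, lhs, rhs, dst);

    // rhs collapsed to a single row and broadcast along the x dimension
    T_ELTWISE_BROADCAST_RHS_X_MUL(DATA_TYPE, M0, N0, lhs, rhs, dst);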
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 9a218b3e75..2611d6d575 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -37,7 +37,8 @@ namespace
{
std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops
{
- ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Add,
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul
};
}
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
index a02160cba8..e7ee1c10df 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
namespace arm_compute
@@ -36,9 +36,13 @@ Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch,
const ITensorInfo *lhs,
const ITensorInfo *rhs)
{
- // Set the elementwise operation to ADD then call the elementwise common validate_op
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Add then call the elementwise common validate_op
ElementwiseBinaryCommonAttributes common_attributes{};
- common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD);
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
}
@@ -46,9 +50,13 @@ Status GpuAdd::is_supported_op(const GpuWorkloadContext &context,
const ITensorInfo *lhs,
const ITensorInfo *rhs)
{
- // Set the elementwise operation to ADD then call the elementwise common is_supported_op
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Add then call the elementwise common is_supported_op
ElementwiseBinaryCommonAttributes common_attributes{};
- common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD);
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
}
@@ -57,9 +65,9 @@ ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch,
ITensorInfo *rhs)
{
// No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
- // Set the elementwise operation to ADD then call the elementwise common create_op
+ // Set the elementwise operation to Add then call the elementwise common create_op
ElementwiseBinaryCommonAttributes common_attributes{};
- common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD);
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
}
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
new file mode 100644
index 0000000000..464a32cbad
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuMul::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Mul then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuMul::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Mul then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs)
+{
+ // Set the elementwise operation to Mul then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
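The three entry points are meant to be layered: is_supported_op() checks the configuration against a context only, validate_op() additionally checks fusion legality against a sketch, and create_op() performs the fusion. A minimal check-before-fuse flow, assuming a context and sketch set up as in the tests, might look like:

    // Sketch only: query support, validate against the sketch, then fuse
    if(bool(GpuMul::is_supported_op(gpu_ctx, &lhs_info, &rhs_info)) &&
       bool(GpuMul::validate_op(sketch, &lhs_info, &rhs_info)))
    {
        ITensorInfo *ans_info = GpuMul::create_op(sketch, &lhs_info, &rhs_info);
        // ans_info can feed another fused operator or a GpuOutput
    }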
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
index cbefa379e6..0b58b6eb96 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
@@ -25,11 +25,12 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensorInfo.h"
namespace arm_compute
{
+/** Forward declaration */
class ITensorInfo;
+
namespace experimental
{
namespace dynamic_fusion
@@ -39,14 +40,15 @@ class ElementwiseBinaryCommonAttributes
public:
enum class ElementwiseOp
{
- ADD, /**< (x + y) */
- SUB, /**< (x - y) */
- DIV, /**< (x / y) */
- MIN, /**< Min(x, y) */
- MAX, /**< Max(x, y) */
- SQUARED_DIFF, /**< (x - y)^2 */
- POWER, /**< x ^ y */
- PRELU, /**< y*x if x < 0, x otherwise */
+ Add, /**< (x + y) */
+ Sub, /**< (x - y) */
+ Div, /**< (x / y) */
+ Mul, /**< (x * y) */
+ Min, /**< Min(x, y) */
+ Max, /**< Max(x, y) */
+ SquaredDiff, /**< (x - y)^2 */
+ Power, /**< x ^ y */
+ Prelu, /**< y*x if x < 0, x otherwise */
};
/** Set operation*/
ElementwiseBinaryCommonAttributes &operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation);
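The setter returns a reference to the attributes object, so calls can be chained; the operators above use it in two steps, as in GpuMul.cpp:

    ElementwiseBinaryCommonAttributes common_attributes{};
    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);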
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 01017ed909..0dd7ca5e78 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -68,7 +68,7 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup
code =
R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
+ //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
)_";
if(is_root)
@@ -139,7 +139,7 @@ R"_(
code +=
R"_(
}
- //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
+ //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
)_";
return code;
@@ -168,33 +168,34 @@ void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtab
TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- TagLUT lut{};
+ TagLUT lut{};
// Local build options
lut["meta_kernel_id"] = id();
lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type());
// Arguments and global shared variables
- lut["lhs"] = vtable.get_variable(_lhs);
- lut["rhs"] = vtable.get_variable(_rhs);
- lut["dst"] = vtable.get_variable(_dst);
+ lut["lhs"] = vtable.get_variable(_lhs);
+ lut["rhs"] = vtable.get_variable(_rhs);
+ lut["dst"] = vtable.get_variable(_dst);
lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
switch(_attributes.operation())
{
- case Attributes::ElementwiseOp::ADD:
+ case Attributes::ElementwiseOp::Add:
lut["ELTWISE_OP"] = "ADD";
break;
+ case Attributes::ElementwiseOp::Mul:
+ lut["ELTWISE_OP"] = "MUL";
+ break;
default:
ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
}
ARM_COMPUTE_ERROR_ON(
- comp_group.is_intermediate_tensor(_lhs) &&
- detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
+ comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
ARM_COMPUTE_ERROR_ON(
- comp_group.is_intermediate_tensor(_rhs) &&
- detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
+ comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
// Set broadcast parameters
// PRE: All tensors are broadcast-compatible
@@ -222,9 +223,9 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0";
lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1";
- lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" :
- (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" :
- "";
+ lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" :
+ (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" :
+ "";
return lut;
}
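Tying the two lookup entries together: the kernel template composes the tile_helpers macro name from the BROADCAST_OP and ELTWISE_OP tags. Inferred from the macros added in tile_helpers.h above (the composing template line itself is not shown in this hunk), a Mul with an x-broadcast rhs would resolve as:

    // {{BROADCAST_OP}} = "BROADCAST_RHS_X_", {{ELTWISE_OP}} = "MUL"
    // T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}  ->  T_ELTWISE_BROADCAST_RHS_X_MUL
    T_ELTWISE_BROADCAST_RHS_X_MUL(DATA_TYPE, M0, N0, lhs, rhs, dst);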
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
index 0385407ad2..52ba0520ad 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
@@ -34,7 +34,6 @@
#include "tests/datasets/DynamicFusionDataset.h"
#include "tests/datasets/ShapeDatasets.h"
#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
namespace arm_compute
{
@@ -42,6 +41,14 @@ namespace test
{
namespace validation
{
+/* Synced with tests/validation/CL/ArithmeticAddition.cpp from the standard interface.
+ *
+ * Difference | Why the difference
+ * No quantized tests | Not supported yet
+ * No in place tests | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
TEST_SUITE(CL)
TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(ADD)
@@ -49,29 +56,33 @@ TEST_SUITE(ADD)
// *INDENT-OFF*
// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
- framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // S16 is valid data type for Add
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // S32 is valid data type for Add
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), // Broadcast Y dimension is not allowed
TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::S16), // Broadcast Z dimension is not allowed
TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
}),
- framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs
TensorInfo(TensorShape(15U, 1U, 3U), 1, DataType::F32),
TensorInfo(TensorShape( 3U, 8U, 1U), 1, DataType::S16),
TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
})),
- framework::dataset::make("Expected", { true, false, true, true, false, true, true, false, false, true})),
+ framework::dataset::make("Expected", { true, false, true, true, false, true, false, false, true, false, false, true})),
input1_info, input2_info, expected)
{
// Create a new workload sketch
@@ -79,7 +90,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
GpuWorkloadSketch sketch{ &gpu_ctx };
- // Fuse Elementwise Add
+ // Validate Elementwise Add
auto lhs_info = sketch.create_tensor_info(input1_info);
auto rhs_info = sketch.create_tensor_info(input2_info);
@@ -89,59 +100,73 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
// clang-format on
// *INDENT-ON*
-RelativeTolerance<float> tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-constexpr float tolerance_num = 0.01f; /**< Tolerance number */
+constexpr AbsoluteTolerance<float> tolerance_f16(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr float tolerance_num = 0.0001f; /**< Tolerance number */
template <typename T>
-using DynamicFusionAddOpFixture = DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+using DynamicFusionCLAddFixture = DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
template <typename T>
-using DynamicFusionAddOpBroadcastFixture = DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+using DynamicFusionCLAddBroadcastFixture = DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
template <typename T>
-using DynamicFusionGpuFuseTwoAddOpsFixture = DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+using DynamicFusionCLAddTwoOpsFixture = DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmallOneOp, DynamicFusionAddOpFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
- framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::SmallShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::F32 })),
- framework::dataset::make("InPlace", { false, true })))
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLAddFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLargeOneOp, DynamicFusionAddOpFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(
- framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::LargeShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::F32 })),
- framework::dataset::make("InPlace", { false, true })))
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+ DynamicFusionCLAddFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::LargeShapes()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::TemporaryLimitedSmallShapesBroadcast()),
- framework::dataset::make("DataType", { DataType::F32 })),
- framework::dataset::make("InPlace", { false, true })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLAddBroadcastFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::TemporaryLimitedSmallShapesBroadcast()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::TemporaryLimitedLargeShapesBroadcast()),
- framework::dataset::make("DataType", { DataType::F32 })),
- framework::dataset::make("InPlace", { false, true })))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLAddBroadcastFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::TemporaryLimitedLargeShapesBroadcast()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunSmallTwoOps, DynamicFusionGpuFuseTwoAddOpsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
- framework::dataset::make("DataType", { DataType::F32 })),
- framework::dataset::make("InPlace", { false })))
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionCLAddTwoOpsFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })),
+ framework::dataset::make("FuseTwoOps", { true })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -149,19 +174,25 @@ FIXTURE_DATA_TEST_CASE(RunSmallTwoOps, DynamicFusionGpuFuseTwoAddOpsFixture<floa
TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmallOneOp, DynamicFusionAddOpFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::SmallShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::F16 })),
- framework::dataset::make("InPlace", { false, true })))
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLAddFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", { DataType::F16 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
}
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::TemporaryLimitedSmallShapesBroadcast()),
- framework::dataset::make("DataType", { DataType::F16 })),
- framework::dataset::make("InPlace", { false })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLAddBroadcastFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::TemporaryLimitedSmallShapesBroadcast()),
+ framework::dataset::make("DataType", { DataType::F16 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -170,10 +201,13 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixtur
TEST_SUITE_END() // FP16
TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::SmallShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::S32 })),
- framework::dataset::make("InPlace", { false })))
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLAddFixture<int32_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", { DataType::S32 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -181,18 +215,24 @@ FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int32_t>, framework::
TEST_SUITE_END() // S32
TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::SmallShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::S16 })),
- framework::dataset::make("InPlace", { false })))
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLAddFixture<int16_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", { DataType::S16 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionAddOpFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::LargeShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::S16 })),
- framework::dataset::make("InPlace", { false })))
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ DynamicFusionCLAddFixture<int16_t>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::LargeShapes()),
+ framework::dataset::make("DataType", { DataType::S16 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -200,10 +240,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionAddOpFixture<int16_t>, framework::
TEST_SUITE_END() // S16
TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
- datasets::SmallShapesNoBatches()),
- framework::dataset::make("DataType", { DataType::U8 })),
- framework::dataset::make("InPlace", { false })))
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ DynamicFusionCLAddFixture<uint8_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+ datasets::SmallShapes()),
+ framework::dataset::make("DataType", { DataType::U8 })),
+ framework::dataset::make("InPlace", { false })))
{
// Validate output
validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
new file mode 100644
index 0000000000..a9e8f9c15f
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/PixelwiseMultiplication.cpp from the standard interface.
+ *
+ * Difference | Why the difference
+ * No integer tests | Not supported yet
+ * No quantized tests | Not supported yet
+ * No convert policy tests | Not needed as convert policy is ignored by floating types
+ * No scale tests | Not supported yet
+ * No rounding modes tests | Not supported yet
+ * No in place tests | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f16(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+} // namespace
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(MUL)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+ framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Unsupported data type U8
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Unsupported data type S8
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // Unsupported data type S16
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // Unsupported data type S32
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Unsupported data type QASYMM8
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), // Unsupported data type QASYMM8_SIGNED
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), // Broadcast Y dimension is not allowed
+ TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::F32), // Broadcast Z dimension is not allowed
+ TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+ }),
+ framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+ TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs
+ TensorInfo(TensorShape(15U, 1U, 3U), 1, DataType::F32),
+ TensorInfo(TensorShape( 3U, 8U, 1U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+ })),
+ framework::dataset::make("Expected", { true, true, false, false, false, false, false, false, false, false, true, true, false, false, true })),
+ input1_info, input2_info, expected)
+{
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Validate Elementwise Mul
+ auto lhs_info = sketch.create_tensor_info(input1_info);
+ auto rhs_info = sketch.create_tensor_info(input2_info);
+
+ bool res = bool(GpuMul::validate_op(sketch, &lhs_info, &rhs_info));
+ ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionCLMulFixture = DynamicFusionMulOneOpValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+template <typename T>
+using DynamicFusionCLMulBroadcastFixture = DynamicFusionMulBroadcastValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+template <typename T>
+using DynamicFusionCLMulTwoOpsFixture = DynamicFusionMulTwoOpsValidationFixture<CLTensor, CLAccessor, GpuMul, T>;
+
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLMulFixture<half>,
+ framework::DatasetMode::ALL,
+ combine(combine(datasets::SmallShapes(),
+ framework::dataset::make("DataType", { DataType::F16 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<half>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::TemporaryLimitedSmallShapesBroadcast(),
+ framework::dataset::make("DataType", { DataType::F16 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::TemporaryLimitedLargeShapesBroadcast(),
+ framework::dataset::make("DataType", { DataType::F16 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // F16
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+ DynamicFusionCLMulFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::SmallShapes(),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+ DynamicFusionCLMulFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::LargeShapes(),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(datasets::TemporaryLimitedSmallShapesBroadcast(),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+ DynamicFusionCLMulBroadcastFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(datasets::TemporaryLimitedLargeShapesBroadcast(),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+ DynamicFusionCLMulTwoOpsFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes(),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("InPlace", { false })),
+ framework::dataset::make("FuseTwoOps", { true })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // F32
+
+TEST_SUITE_END() // MUL
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
index faed610874..b0680c0e4a 100644
--- a/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
@@ -31,12 +31,9 @@
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
-#include "tests/CL/CLAccessor.h"
#include "tests/framework/Fixture.h"
#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
#include "tests/validation/reference/ElementwiseOperations.h"
-#include "tests/validation/reference/Permute.h"
using namespace arm_compute::experimental::dynamic_fusion;
@@ -51,12 +48,13 @@ class DynamicFusionGpuElementwiseBinaryValidationGenericFixture : public framewo
{
public:
template <typename...>
- void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, TensorShape shape2, const DataType data_type, const bool is_inplace)
+ void setup(ArithmeticOperation ref_op, const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2, DataType data_type, bool is_inplace, bool fuse_two_ops = false)
{
- _op = op;
+ _ref_op = ref_op;
_is_inplace = is_inplace;
_data_type = data_type;
- _fuse = shape2.total_size() != 0;
+ _fuse = fuse_two_ops;
+ ARM_COMPUTE_ERROR_ON_MSG(_fuse && shape2.total_size() == 0, "No shape2 provided for fusion of two ops.");
ARM_COMPUTE_ERROR_ON_MSG(_fuse && _is_inplace, "In place for fusing case not supported yet.");
_target = compute_target(shape0, shape1, shape2);
_reference = compute_reference(shape0, shape1, shape2);
@@ -68,7 +66,7 @@ protected:
{
if(is_data_type_float(tensor.data_type()))
{
- switch(_op)
+ switch(_ref_op)
{
case ArithmeticOperation::DIV:
library->fill_tensor_uniform_ranged(tensor, i, { std::pair<float, float>(-0.001f, 0.001f) });
@@ -82,7 +80,7 @@ protected:
}
else if(tensor.data_type() == DataType::S32)
{
- switch(_op)
+ switch(_ref_op)
{
case ArithmeticOperation::DIV:
library->fill_tensor_uniform_ranged(tensor, i, { std::pair<int32_t, int32_t>(-1U, 1U) });
@@ -97,7 +95,7 @@ protected:
}
}
- TensorType compute_target(TensorShape shape0, TensorShape shape1, TensorShape shape2)
+ TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
{
// Create a new workload sketch
auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
@@ -105,7 +103,7 @@ protected:
GpuWorkloadSketch sketch{ &gpu_ctx };
// Fuse first element wise binary Op
- TensorInfo lhs_info = sketch.create_tensor_info(shape0, 1, _data_type);
+ TensorInfo lhs_info = sketch.create_tensor_info(TensorInfo(shape0, 1, _data_type));
TensorInfo rhs_info = sketch.create_tensor_info(TensorInfo(shape1, 1, _data_type));
TensorInfo dst_info = sketch.create_tensor_info();
@@ -115,7 +113,7 @@ protected:
if(_fuse)
{
- rhs_info_fuse = sketch.create_tensor_info(shape2, 1, _data_type);
+ rhs_info_fuse = sketch.create_tensor_info(TensorInfo(shape2, 1, _data_type));
ITensorInfo *ans2_info = FunctionType::create_op(sketch, ans_info, &rhs_info_fuse);
GpuOutput::create_op(sketch, ans2_info, &dst_info);
}
@@ -183,7 +181,7 @@ protected:
return t_dst;
}
- SimpleTensor<T> compute_reference(TensorShape shape0, TensorShape shape1, TensorShape shape2)
+ SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
{
const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
const TensorShape out_shape_fuse = TensorShape::broadcast_shape(out_shape, shape2);
@@ -194,21 +192,22 @@ protected:
SimpleTensor<T> ref_rhs_fuse{ shape2, _data_type, 1, QuantizationInfo() };
SimpleTensor<T> ref_dst{ out_shape, _data_type, 1, QuantizationInfo() };
SimpleTensor<T> ref_dst_fuse{ out_shape_fuse, _data_type, 1, QuantizationInfo() };
+
// Fill reference
fill(ref_lhs, 0);
fill(ref_rhs, 1);
- reference::arithmetic_operation<T>(_op, ref_lhs, ref_rhs, ref_dst, ConvertPolicy::WRAP);
+ reference::arithmetic_operation<T>(_ref_op, ref_lhs, ref_rhs, ref_dst, ConvertPolicy::WRAP);
if(_fuse)
{
fill(ref_rhs_fuse, 2);
- reference::arithmetic_operation<T>(_op, ref_dst, ref_rhs_fuse, ref_dst_fuse, ConvertPolicy::WRAP);
+ reference::arithmetic_operation<T>(_ref_op, ref_dst, ref_rhs_fuse, ref_dst_fuse, ConvertPolicy::WRAP);
}
SimpleTensor<T> *ret = _fuse ? &ref_dst_fuse : &ref_dst;
return *ret;
}
- ArithmeticOperation _op{ ArithmeticOperation::ADD };
+ ArithmeticOperation _ref_op{ ArithmeticOperation::ADD };
TensorType _target{};
SimpleTensor<T> _reference{};
DataType _data_type{};
@@ -222,9 +221,9 @@ class DynamicFusionGpuElementwiseBinaryOneOpValidationFixture : public DynamicFu
{
public:
template <typename...>
- void setup(ArithmeticOperation op, TensorShape shape, const DataType data_type, const bool is_inplace)
+ void setup(ArithmeticOperation ref_op, const TensorShape &shape0, DataType data_type, bool is_inplace)
{
- DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape, shape, TensorShape(), data_type, is_inplace);
+ DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ref_op, shape0, shape0, TensorShape(), data_type, is_inplace);
}
};
@@ -233,9 +232,9 @@ class DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture : public
{
public:
template <typename...>
- void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, const DataType data_type, const bool is_inplace)
+ void setup(ArithmeticOperation ref_op, const TensorShape &shape0, const TensorShape &shape1, DataType data_type, bool is_inplace)
{
- DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, TensorShape(), data_type, is_inplace);
+ DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ref_op, shape0, shape1, TensorShape(), data_type, is_inplace);
}
};
@@ -244,9 +243,9 @@ class DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture : public DynamicF
{
public:
template <typename...>
- void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, TensorShape shape2, const DataType data_type, const bool is_inplace)
+ void setup(ArithmeticOperation ref_op, const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2, DataType data_type, bool is_inplace, bool fuse_two_ops)
{
- DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, shape2, data_type, is_inplace);
+ DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ref_op, shape0, shape1, shape2, data_type, is_inplace, fuse_two_ops);
}
};
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h
new file mode 100644
index 0000000000..0530707c38
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/operators/MulFixture.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_MULFIXTURE
+#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_MULFIXTURE
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/Globals.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* We use a separate test fixture for Multiplication op instead of reusing ElementwiseBinaryFixture to avoid exposing
+ * the internal enum ElementwiseOp to the public utils/TypePrinter.h as required by the data test case macros
+ * to print the test data.
+ */
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulValidationFixture : public framework::Fixture
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2, DataType data_type, bool is_inplace, bool fuse_two_ops = false)
+ {
+ _data_type = data_type;
+ _is_inplace = is_inplace;
+ _fuse = fuse_two_ops;
+ ARM_COMPUTE_ERROR_ON_MSG(_fuse && shape2.total_size() == 0, "No shape2 provided for fusion of two ops.");
+ ARM_COMPUTE_ERROR_ON_MSG(_fuse && _is_inplace, "In place for fusing case not supported yet.");
+ _target = compute_target(shape0, shape1, shape2);
+ _reference = compute_reference(shape0, shape1, shape2);
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i)
+ {
+ library->fill_tensor_uniform(tensor, i);
+ }
+
+ TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
+ {
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Fuse first multiplication op
+ TensorInfo lhs_info = sketch.create_tensor_info(TensorInfo(shape0, 1, _data_type));
+ TensorInfo rhs_info = sketch.create_tensor_info(TensorInfo(shape1, 1, _data_type));
+ TensorInfo dst_info = sketch.create_tensor_info();
+
+ TensorInfo rhs_info_fuse;
+
+ ITensorInfo *ans_info = FunctionType::create_op(sketch, &lhs_info, &rhs_info);
+
+ if(_fuse)
+ {
+ rhs_info_fuse = sketch.create_tensor_info(TensorInfo(shape2, 1, _data_type));
+ ITensorInfo *ans2_info = FunctionType::create_op(sketch, ans_info, &rhs_info_fuse);
+ GpuOutput::create_op(sketch, ans2_info, &dst_info);
+ }
+ else
+ {
+ GpuOutput::create_op(sketch, ans_info, &dst_info);
+ }
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = std::get<0>(data);
+ TensorInfo info = std::get<1>(data);
+ AuxMemoryInfo aux_mem_req = std::get<2>(data);
+ tensor->allocator()->init(info, aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL-allocated memory
+ }
+
+ // Construct user tensors
+ TensorType t_lhs{};
+ TensorType t_rhs{};
+ TensorType t_rhs_fuse{};
+ TensorType t_dst{};
+
+ // Initialize user tensors
+ t_lhs.allocator()->init(lhs_info);
+ t_rhs.allocator()->init(rhs_info);
+ t_dst.allocator()->init(dst_info);
+ if(_fuse)
+ {
+ t_rhs_fuse.allocator()->init(rhs_info_fuse);
+ }
+
+ // Allocate and fill user tensors
+ // Instead of using the ACL allocator, the user can choose to import memory into the tensors
+ t_lhs.allocator()->allocate();
+ t_rhs.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ if(_fuse)
+ {
+ t_rhs_fuse.allocator()->allocate();
+ }
+
+ fill(AccessorType(t_lhs), 0);
+ fill(AccessorType(t_rhs), 1);
+ if(_fuse)
+ {
+ fill(AccessorType(t_rhs_fuse), 2);
+ }
+
+ // Run runtime
+ if(_fuse)
+ {
+ runtime.run({ &t_lhs, &t_rhs, &t_rhs_fuse, &t_dst });
+ }
+ else
+ {
+ runtime.run({ &t_lhs, &t_rhs, &t_dst });
+ }
+
+ return t_dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2)
+ {
+ // Create reference
+ SimpleTensor<T> ref_lhs{ shape0, _data_type, 1, QuantizationInfo() };
+ SimpleTensor<T> ref_rhs{ shape1, _data_type, 1, QuantizationInfo() };
+ SimpleTensor<T> ref_rhs_fuse{ shape2, _data_type, 1, QuantizationInfo() };
+
+ // Fill reference
+ fill(ref_lhs, 0);
+ fill(ref_rhs, 1);
+ SimpleTensor<T> ref_dst = reference::pixel_wise_multiplication<T, T, T>(ref_lhs,
+ ref_rhs,
+ 1.f,
+ ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_UP,
+ _data_type,
+ QuantizationInfo());
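+ // In the fused case the reference chains a second multiplication, matching
+ // the mul(mul(lhs, rhs), rhs_fuse) computed by the target.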
+ if(_fuse)
+ {
+ fill(ref_rhs_fuse, 2);
+ SimpleTensor<T> ref_dst_fuse = reference::pixel_wise_multiplication<T, T, T>(ref_dst,
+ ref_rhs_fuse,
+ 1.f,
+ ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_UP,
+ _data_type,
+ QuantizationInfo());
+ return ref_dst_fuse;
+ }
+ return ref_dst;
+ }
+
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+ DataType _data_type{};
+ bool _is_inplace{ false };
+ bool _fuse{ false };
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulOneOpValidationFixture : public DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape0, DataType data_type, bool is_inplace)
+ {
+ DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape0, TensorShape(), data_type, is_inplace);
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulBroadcastValidationFixture : public DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type, bool is_inplace)
+ {
+ DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape1, TensorShape(), data_type, is_inplace);
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionMulTwoOpsValidationFixture : public DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape0, const TensorShape &shape1, const TensorShape &shape2, DataType data_type, bool is_inplace, bool fuse_two_ops)
+ {
+ DynamicFusionMulValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape1, shape2, data_type, is_inplace, fuse_two_ops);
+ }
+};
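+
+/* Illustrative usage (not part of this patch): a test would typically
+ * instantiate one of the fixtures above through the framework macros, along
+ * the lines of
+ *
+ *   FIXTURE_DATA_TEST_CASE(RunSmall,
+ *                          DynamicFusionMulOneOpValidationFixture<CLTensor, CLAccessor, GpuMul, float>,
+ *                          framework::DatasetMode::ALL,
+ *                          combine(combine(datasets::SmallShapes(),
+ *                                          framework::dataset::make("DataType", { DataType::F32 })),
+ *                                  framework::dataset::make("InPlace", { false })))
+ *   {
+ *       validate(CLAccessor(_target), _reference);
+ *   }
+ *
+ * The actual datasets and tolerances live in tests/validation/dynamic_fusion/gpu/cl/Mul.cpp.
+ */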
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_OPERATORS_MULFIXTURE */
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 61f0eb9cec..448f184432 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -421,8 +421,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLHSMatrixInfo &g
*/
inline ::std::ostream &operator<<(::std::ostream &os, const GEMMRHSMatrixInfo &gemm_info)
{
- os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << " h0=" << gemm_info.h0 << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << " exp_img=" <<
- gemm_info.export_to_cl_image << "})";
+ os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << " h0=" << gemm_info.h0 << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << " exp_img=" << gemm_info.export_to_cl_image << "})";
return os;
}
@@ -475,8 +474,7 @@ inline std::string to_string(const GEMMKernelInfo &gemm_info)
inline ::std::ostream &operator<<(::std::ostream &os, const BoundingBoxTransformInfo &bbox_info)
{
auto weights = bbox_info.weights();
- os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] <<
- "})";
+ os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] << "})";
return os;
}
@@ -3305,46 +3303,46 @@ inline std::string to_string(const Conv3dInfo &conv3d_info)
inline std::string to_string(const WeightFormat wf)
{
#define __CASE_WEIGHT_FORMAT(wf) \
-case WeightFormat::wf: \
- return #wf;
+ case WeightFormat::wf: \
+ return #wf;
switch(wf)
{
- __CASE_WEIGHT_FORMAT(UNSPECIFIED)
- __CASE_WEIGHT_FORMAT(ANY)
- __CASE_WEIGHT_FORMAT(OHWI)
- __CASE_WEIGHT_FORMAT(OHWIo2)
- __CASE_WEIGHT_FORMAT(OHWIo4)
- __CASE_WEIGHT_FORMAT(OHWIo8)
- __CASE_WEIGHT_FORMAT(OHWIo16)
- __CASE_WEIGHT_FORMAT(OHWIo32)
- __CASE_WEIGHT_FORMAT(OHWIo64)
- __CASE_WEIGHT_FORMAT(OHWIo128)
- __CASE_WEIGHT_FORMAT(OHWIo4i2)
- __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo8i2)
- __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo16i2)
- __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo32i2)
- __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo64i2)
- __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo4i4)
- __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo8i4)
- __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo16i4)
- __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo32i4)
- __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo64i4)
- __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo2i8)
- __CASE_WEIGHT_FORMAT(OHWIo4i8)
- __CASE_WEIGHT_FORMAT(OHWIo8i8)
- __CASE_WEIGHT_FORMAT(OHWIo16i8)
- __CASE_WEIGHT_FORMAT(OHWIo32i8)
- __CASE_WEIGHT_FORMAT(OHWIo64i8)
+ __CASE_WEIGHT_FORMAT(UNSPECIFIED)
+ __CASE_WEIGHT_FORMAT(ANY)
+ __CASE_WEIGHT_FORMAT(OHWI)
+ __CASE_WEIGHT_FORMAT(OHWIo2)
+ __CASE_WEIGHT_FORMAT(OHWIo4)
+ __CASE_WEIGHT_FORMAT(OHWIo8)
+ __CASE_WEIGHT_FORMAT(OHWIo16)
+ __CASE_WEIGHT_FORMAT(OHWIo32)
+ __CASE_WEIGHT_FORMAT(OHWIo64)
+ __CASE_WEIGHT_FORMAT(OHWIo128)
+ __CASE_WEIGHT_FORMAT(OHWIo4i2)
+ __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo8i2)
+ __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo16i2)
+ __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo32i2)
+ __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo64i2)
+ __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo4i4)
+ __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo8i4)
+ __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo16i4)
+ __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo32i4)
+ __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo64i4)
+ __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo2i8)
+ __CASE_WEIGHT_FORMAT(OHWIo4i8)
+ __CASE_WEIGHT_FORMAT(OHWIo8i8)
+ __CASE_WEIGHT_FORMAT(OHWIo16i8)
+ __CASE_WEIGHT_FORMAT(OHWIo32i8)
+ __CASE_WEIGHT_FORMAT(OHWIo64i8)
default:
return "invalid value";
}
@@ -3629,7 +3627,7 @@ inline std::string to_string(const experimental::dynamic_fusion::ResizeAttribute
*/
inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynamic_fusion::SoftmaxAttributes &softmax_attr)
{
- os << "SofmtaxAttributes="
+ os << "SoftmaxAttributes="
<< "["
<< "Beta=" << softmax_attr.beta() << ", "
<< "Is Log Softmax=" << softmax_attr.is_log_softmax() << ", "