Implement new Elementwise Dynamic Fusion Operators: Div, Floor

Resolves: COMPMID-5355 Change-Id: I92f73fbe885f28bbe7b07965b90cfd807c93602f Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7745 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com>
author: Michalis Spyrou <michalis.spyrou@arm.com> 2022-06-15 19:02:28 +0100
committer: Michalis Spyrou <michalis.spyrou@arm.com> 2022-06-27 14:05:05 +0000
commit: b1fcefddf3f59219a9d7930d607175b7e6c39347 (patch)
tree: 34e95efded15194b3c8abe4ba3da308c3259301d /src
parent: 41eb2d92c89274200d59ff97653e2bd66819b310 (diff)
download: ComputeLibrary-b1fcefddf3f59219a9d7930d607175b7e6c39347.tar.gz
21 files changed, 530 insertions, 87 deletions
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index 9fad66df68..53ce296948 100644
--- a/src/core/CL/cl_kernels/common/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -703,7 +703,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t
 
     a_offset_s32[0].v *= A_OFFSET;
 
-    T_ADD_BROADCAST_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
 #endif // defined(A_OFFSET)
 
 #if defined(B_OFFSET)
@@ -728,7 +728,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t
 
     T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
 
-    T_ADD_BROADCAST_X(int, M0, N0, offset_s32, bias, offset_s32);
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
 #endif // defined(ADD_BIAS)
 
     LOOP_UNROLLING(int, i, 0, 1, M0,
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index f1b422a68f..d34e24b436 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -233,7 +233,7 @@ __kernel void direct_convolution_nhwc(
     T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 
 #endif // HAS_BIAS
 
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
index 587f3984ab..807b990e82 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -254,7 +254,7 @@ __kernel void direct_convolution3d_ndhwc(
     }
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 
 #endif // HAS_BIAS
 
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 4f57a81e7b..b24a6ae85f 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -175,7 +175,7 @@ __kernel void dwc_native_fp_nhwc(
         T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0);
 
         // c = c + bias[broadcasted]
-        T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+        T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 #endif // HAS_BIAS
 
         T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index ec2593af71..263a23ef28 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -236,7 +236,7 @@ __kernel void dwc_native_quantized_nhwc(
         T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0);
 
         // c = c + bias[broadcasted]
-        T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+        T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 #endif // HAS_BIAS
 
         T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
index bab2ee850c..0883cd99c8 100644
--- a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -111,7 +111,7 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc(
 
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
-    T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
 #endif // defined(HAS_BIAS)
 
     T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
@@ -177,7 +177,7 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc(
 
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
 #endif // defined(HAS_BIAS)
 
     T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
@@ -287,7 +287,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
@@ -374,7 +374,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
@@ -488,7 +488,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
@@ -586,7 +586,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 69898e1bb4..0ce343e3ec 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -970,6 +970,9 @@
 #define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 #define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 
+#define T_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
+#define T_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
+
 /** Element-wise activation for quantized types
  *
  * @note Performs: activation(LHS) = DST
@@ -1014,11 +1017,15 @@
         })                                                        \
     })
 
-/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)
+#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
  *
- * @note Performs: LHS + RHS[broadcasted] = DST
+ * @note Performs: LHS OP RHS[broadcasted] = DST
  * @note Both tiles must have same data type
  *
+ * @param[in]  T_ELWISE_OP   Elementwise operator to perform
  * @param[in]  DST_DATA_TYPE DST data type
  * @param[in]  M0            Number of LHS rows
  * @param[in]  N0            Number of LHS columns
@@ -1026,19 +1033,23 @@
  * @param[in]  rhs           RHS tile
  * @param[out] dst           DST tile
  */
-#define T_ADD_BROADCAST_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
     ({                                                      \
         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
         {                                                   \
-            dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));             \
+            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
         })                                                  \
     })
 
-/** Element-wise addition between two tiles (LHS and RHS)
+#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise operation between two tiles (LHS and RHS)
  *
- * @note Performs: LHS + RHS = DST
+ * @note Performs: LHS OP RHS = DST
  * @note Both tiles must have same data type
  *
+ * @param[in]  T_ELWISE_OP   Elementwise operator to perform
  * @param[in]  DST_DATA_TYPE DST data type
  * @param[in]  M0            Number of LHS rows
  * @param[in]  N0            Number of LHS columns
@@ -1046,11 +1057,30 @@
  * @param[in]  rhs           RHS tile
  * @param[out] dst           DST tile
  */
-#define T_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+    ({                                                      \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
+        {                                                   \
+            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
+        })                                                  \
+    })
+
+/** Floor operation on a tile
+ *
+ * @note Performs: floor(SRC) = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in]  DST_DATA_TYPE DST data type
+ * @param[in]  M0            Number of SRC rows
+ * @param[in]  N0            Number of SRC columns
+ * @param[in]  src           LHS tile
+ * @param[out] dst           DST tile
+ */
+#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
     ({                                                      \
         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
         {                                                   \
-            dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));             \
+            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
         })                                                  \
     })
 
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index 0be2ba02b5..9b6daae619 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -55,18 +55,32 @@ Status add_tensor(ClKernelBlueprint &kernel_blueprint, ITensorInfo *tensor_info,
     return Status{};
 }
 
-Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClEltwiseAddKernelDescriptor &,
-                             ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
+Status add_kcomp_eltwise_op(ClKernelBlueprint &kernel_blueprint, const ClElementwiseKernelDescriptor &desc,
+                            ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
 {
     kernel_blueprint.impl().add_component(
-        std::make_unique<ClElementwiseAddKernelComponent>(
+        std::make_unique<ClElementwiseKernelComponent>(
             &kernel_blueprint,
+            desc,
             SharedVarLink{ src0_id, SharedVarIO::Input },
             SharedVarLink{ src1_id, SharedVarIO::Input },
             SharedVarLink{ dst_id, SharedVarIO::Output }));
 
     return Status{};
 }
+
+Status add_kcomp_floor(ClKernelBlueprint &kernel_blueprint, const ClFloorKernelDescriptor &,
+                       ArgumentID src_id, ArgumentID &dst_id)
+{
+    kernel_blueprint.impl().add_component(
+        std::make_unique<ClFloorKernelComponent>(
+            &kernel_blueprint,
+            SharedVarLink{ src_id, SharedVarIO::Input },
+            SharedVarLink{ dst_id, SharedVarIO::Output }));
+
+    return Status{};
+}
+
 Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &)
 {
     return Status{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 067e9737e3..463fc5e7cf 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -59,9 +59,13 @@ public:
 };
 
 ///// Kernel Components /////
-/** Component: Eltwise Add */
-Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClEltwiseAddKernelDescriptor &, ArgumentID src0_id,
-                             ArgumentID src1_id, ArgumentID &dst_id);
+/** Component: Eltwise Operator */
+Status add_kcomp_eltwise_op(ClKernelBlueprint &, const ClElementwiseKernelDescriptor &, ArgumentID src0_id,
+                            ArgumentID src1_id, ArgumentID &dst_id);
+
+/** Component: Floor */
+Status add_kcomp_floor(ClKernelBlueprint &, const ClFloorKernelDescriptor &, ArgumentID src_id,
+                       ArgumentID &dst_id);
 
 /** Component: Activation */
 Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
index b63e2167b7..811cd79811 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
@@ -237,7 +237,7 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
             T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0);
 
             // c = c + bias[broadcasted]
-            T_ADD_BROADCAST_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+            T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
         )_";
     }
 
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
index 965a68f51d..24a9eee9a3 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
@@ -23,7 +23,7 @@
  */
 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
 
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h"
 #include "arm_compute/core/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -34,17 +34,17 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-ComponentType ClElementwiseAddKernelComponent::get_component_type() const
+ComponentType ClElementwiseKernelComponent::get_component_type() const
 {
     return ComponentType::Simple;
 }
 
-std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
+std::set<std::string> ClElementwiseKernelComponent::get_headers_list() const
 {
     return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
 }
 
-Window ClElementwiseAddKernelComponent::get_window() const
+Window ClElementwiseKernelComponent::get_window() const
 {
     const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
     const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
@@ -64,7 +64,7 @@ Window ClElementwiseAddKernelComponent::get_window() const
     return win;
 }
 
-std::string ClElementwiseAddKernelComponent::get_component_code() const
+std::string ClElementwiseKernelComponent::get_component_code() const
 {
     std::string code;
     const bool  is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
@@ -72,7 +72,7 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const
     if(is_root)
     {
         return R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
     // IN_0(LHS)            {{lhs}}
     // IN_1(RHS)            {{rhs}}
     // OUT(dst, accum)      {{dst}}
@@ -87,19 +87,19 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile);
 
 #if defined(IS_BROADCAST)
-        T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+        T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
 #else // !defined(IS_BROADCAST)
-        T_ADD({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+        T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
 #endif // defined(IS_BROADCAST)
 
     }
-    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
 )_";
     }
     else
     {
         return R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
     // IN_0/Out(Accumulator)   {{acc}}
     // IN_1(Addend)        {{addend}}
 
@@ -110,17 +110,17 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile);
 
 #if defined(IS_BROADCAST)
-        T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+        T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
 #else // !defined(IS_BROADCAST)
-        T_ADD({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+        T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
 #endif // defined(IS_BROADCAST)
     }
-    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
 )_";
     }
 }
 
-CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
+CLBuildOptions ClElementwiseKernelComponent::generate_build_options() const
 {
     const auto t_src_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
     const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
@@ -137,7 +137,7 @@ CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
     return build_opts;
 }
 
-std::string ClElementwiseAddKernelComponent::generate_config_id() const
+std::string ClElementwiseKernelComponent::generate_config_id() const
 {
     auto        t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
     std::string config_id{};
@@ -151,7 +151,7 @@ std::string ClElementwiseAddKernelComponent::generate_config_id() const
     return config_id;
 }
 
-void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+void ClElementwiseKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
 {
     const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
     vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
@@ -162,7 +162,7 @@ void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtabl
     }
 }
 
-ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+ClElementwiseKernelComponent::TagLUT ClElementwiseKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
 {
     TagLUT     lut{};
     const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
@@ -199,6 +199,18 @@ ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag
     // Local build options
     lut["meta_kernel_id"] = id();
     lut["DATA_TYPE"]      = get_cl_type_from_data_type(t_dst_info->data_type());
+
+    switch(_desc.eltwise.op)
+    {
+        case ArithmeticOperation::DIV:
+            lut["ELTWISE_OP"] = "DIV";
+            break;
+        case ArithmeticOperation::ADD:
+            lut["ELTWISE_OP"] = "ADD";
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
+    }
     return lut;
 }
 } // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
index 5f8b1569ac..91b14ffafa 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
@@ -34,11 +34,11 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-class ClElementwiseAddKernelComponent : public IClKernelComponent
+class ClElementwiseKernelComponent : public IClKernelComponent
 {
 public:
-    ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
-        : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
+    ClElementwiseKernelComponent(ClKernelBlueprint *blueprint, const ClElementwiseKernelDescriptor &desc, const Link &lhs, const Link &rhs, const Link &dst)
+        : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
     {
     }
 
@@ -63,9 +63,10 @@ public:
     }
 
 private:
-    Link _lhs{};
-    Link _rhs{};
-    Link _dst{};
+    ClElementwiseKernelDescriptor _desc{};
+    Link                          _lhs{};
+    Link                          _rhs{};
+    Link                          _dst{};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp
new file mode 100644
index 0000000000..87cc110561
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ComponentType ClFloorKernelComponent::get_component_type() const
+{
+    return ComponentType::Simple;
+}
+
+std::set<std::string> ClFloorKernelComponent::get_headers_list() const
+{
+    return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
+}
+
+Window ClFloorKernelComponent::get_window() const
+{
+    const ITensorInfo *src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+    ITensorInfo       *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src_info, dst_info);
+    auto_init_if_empty(*dst_info, src_info->tensor_shape(), 1, src_info->data_type());
+
+    const unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0));
+    Window             win                               = calculate_max_window(*dst_info, Steps(num_elems_processed_per_iteration));
+
+    return win;
+}
+
+std::string ClFloorKernelComponent::get_component_code() const
+{
+    return R"_(
+    //------------------ START KERNEL {{meta_kernel_id}} FLOOR ---------------------
+    // IN_0(src)            {{src}}
+    // OUT(dst, accum)      {{dst}}
+    TILE({{DATA_TYPE}}, M0, N0, {{dst}});
+    {
+        TILE({{DATA_TYPE}}, M0, N0, src_tile);
+
+        T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{src}}, cout, mout, 1, {{src}}_stride_y, src_tile);
+        T_FLOOR({{DATA_TYPE}}, M0, N0, src_tile, {{dst}});
+    }
+
+    //------------------ END KERNEL {{meta_kernel_id}} FLOOR ---------------------
+)_";
+}
+
+CLBuildOptions ClFloorKernelComponent::generate_build_options() const
+{
+    CLBuildOptions build_opts{};
+
+    const auto n0 = _blueprint->impl().get_execution_window().x().step();
+    const auto m0 = _blueprint->impl().get_execution_window().y().step();
+
+    const auto         dst_info         = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    const unsigned int partial_store_n0 = dst_info->dimension(0) % n0;
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    return build_opts;
+}
+
+std::string ClFloorKernelComponent::generate_config_id() const
+{
+    auto        t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    std::string config_id{};
+    config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
+    config_id += "_";
+    config_id += support::cpp11::to_string(t_dst_info->dimension(0));
+    config_id += "_";
+    config_id += support::cpp11::to_string(t_dst_info->dimension(1));
+    config_id += "_";
+    config_id += lower_string(string_from_data_layout(t_dst_info->data_layout()));
+    return config_id;
+}
+
+void ClFloorKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+    vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+    vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClFloorKernelComponent::TagLUT ClFloorKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+{
+    TagLUT     lut{};
+    const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    // Arguments and global shared variables
+    lut["src"]            = vtable.get(_src);
+    lut["dst"]            = vtable.get(_dst);
+    lut["meta_kernel_id"] = id();
+    lut["DATA_TYPE"]      = get_cl_type_from_data_type(t_dst_info->data_type());
+    return lut;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
new file mode 100644
index 0000000000..5463e233d4
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
+
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClFloorKernelComponent : public IClKernelComponent
+{
+public:
+    ClFloorKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+        : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
+    {
+    }
+
+    ComponentType         get_component_type() const override;
+    std::set<std::string> get_headers_list() const override;
+    std::string           get_component_code() const override;
+    Window                get_window() const override;
+    CLBuildOptions        generate_build_options() const override;
+    std::string           generate_config_id() const override;
+
+    virtual std::vector<Link> get_links() const override
+    {
+        return { _src, _dst };
+    }
+
+    virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+    virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
+
+    virtual std::string name() const override
+    {
+        return "floor_" + std::to_string(id());
+    }
+
+private:
+    Link _src{};
+    Link _dst{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
index 26e50523a9..3f99dd5553 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
@@ -27,7 +27,8 @@
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
 
 #endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
index a335e5aada..bd88afdb47 100644
--- a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
+++ b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
@@ -198,7 +198,7 @@ void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMetho
     node->set_method(method);
 }
 
-Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
+Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
 {
     auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() });
     check_dependency_graph_op_success(graph, id.first);
@@ -224,7 +224,36 @@ Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc,
     tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[lhs.id()].get());
     tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get());
     tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
-    graph.impl()->add_node<AddContent>(id.second, desc, tensors);
+    graph.impl()->add_node<ElementwiseContent>(id.second, desc, tensors);
+    check_multiple_roots(graph);
+
+    return op_node;
+}
+
+Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst)
+{
+    auto id = graph.impl()->graph.add_operator({ src.id() }, { dst.id() });
+    check_dependency_graph_op_success(graph, id.first);
+
+    Operator op_node(id.second);
+
+    // Infer TensorInfo
+    auto             node_src = graph.impl()->tensors[src.id()]->get_tensor_info();
+    OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get();
+
+    if(node_dst->get_tensor_info()->total_size() == 0)
+    {
+        auto_init_if_empty(*(node_dst->get_tensor_info()), *node_src);
+    }
+
+    // Check execution space
+    auto dst_info = node_dst->get_tensor_info();
+    check_execution_shape(graph, *dst_info);
+
+    ITensorDescPack<OpTensorContent> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[src.id()].get());
+    tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
+    graph.impl()->add_node<FloorContent>(id.second, desc, tensors);
     check_multiple_roots(graph);
 
     return op_node;
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
index a9ccf908f0..f10e97e3e9 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
@@ -42,14 +42,24 @@ struct ClDirectConv2dKernelDescriptor
     Conv2dDescriptor conv2d{};
 };
 
-struct ClEltwiseAddKernelDescriptor
+struct ClElementwiseKernelDescriptor
 {
-    friend bool operator==(const ClEltwiseAddKernelDescriptor &desc0, const ClEltwiseAddKernelDescriptor &desc1)
+    friend bool operator==(const ClElementwiseKernelDescriptor &desc0, const ClElementwiseKernelDescriptor &desc1)
     {
-        return desc0.add == desc1.add;
+        return desc0.eltwise == desc1.eltwise;
     }
-    AddDescriptor add{};
+    ElementwiseDescriptor eltwise{};
 };
+
+struct ClFloorKernelDescriptor
+{
+    friend bool operator==(const ClFloorKernelDescriptor &desc0, const ClFloorKernelDescriptor &desc1)
+    {
+        return desc0.floor == desc1.floor;
+    }
+    FloorDescriptor floor{};
+};
+
 struct ClActivationKernelDescriptor
 {
     friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &)
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
index de58ce70ed..cab51a2ce6 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
@@ -124,7 +124,7 @@ bool ClDirectConv2dKernel::operator==(const ClKernel &other) const
     return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
 }
 
-Status ClAddKernel::generate(ClKernelBlueprint &bp) const
+Status ClElementwiseKernel::generate(ClKernelBlueprint &bp) const
 {
     const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -137,11 +137,11 @@ Status ClAddKernel::generate(ClKernelBlueprint &bp) const
     ArgumentID dst_id;
     add_tensor(bp, dst->desc, dst_id, dst->id);
 
-    add_kcomp_eltwise_add(bp, desc, lhs_id, rhs_id, dst_id);
+    add_kcomp_eltwise_op(bp, desc, lhs_id, rhs_id, dst_id);
     return Status{};
 }
 
-Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
+Status ClElementwiseKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
 {
     // 1. Check validity
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
@@ -186,9 +186,61 @@ Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, con
     return Status{};
 }
 
-bool ClAddKernel::operator==(const ClKernel &other) const
+bool ClElementwiseKernel::operator==(const ClKernel &other) const
 {
-    const auto converted = *utils::cast::polymorphic_downcast<const ClAddKernel *>(&other);
+    const auto converted = *utils::cast::polymorphic_downcast<const ClElementwiseKernel *>(&other);
+    return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
+}
+
+Status ClFloorKernel::generate(ClKernelBlueprint &bp) const
+{
+    const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ArgumentID src_id;
+    add_tensor(bp, src->desc, src_id, src->id);
+    ArgumentID dst_id;
+    add_tensor(bp, dst->desc, dst_id, dst->id);
+
+    add_kcomp_floor(bp, desc, src_id, dst_id);
+    return Status{};
+}
+
+Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    // 1. Check validity
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+    // Matching data type
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+    // Matching data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+    // All tensor infos are initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+    // Device requirements are met
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+    // dst shape is correct
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(src->tensor_shape(), dst->tensor_shape(), 0), "Wrong shape for dst");
+
+    // 2. Check support level
+
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
+
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+    return Status{};
+}
+
+bool ClFloorKernel::operator==(const ClKernel &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const ClFloorKernel *>(&other);
     return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
 }
 
@@ -202,6 +254,7 @@ std::vector<const ClKernel *> traverse(const ClKernelGraph &graph)
     }
     return kernels;
 }
+
 std::vector<ClKernel *> traverse(ClKernelGraph &graph)
 {
     std::vector<ClKernel *> kernels;
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
index 54e01ea850..c3580cfaca 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
@@ -139,16 +139,16 @@ public:
     ClDirectConv2dKernelDescriptor desc{};
 };
 
-struct ClAddKernel : public ClKernel
+struct ClElementwiseKernel : public ClKernel
 {
 public:
     Complexity complexity() const override
     {
         return Complexity::Simple;
     }
-    ClAddKernel()           = default;
-    ~ClAddKernel() override = default;
-    ClAddKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClEltwiseAddKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+    ClElementwiseKernel()           = default;
+    ~ClElementwiseKernel() override = default;
+    ClElementwiseKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClElementwiseKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
         : ClKernel{ graph, id, config, tensors }, desc{ desc }
     {
     }
@@ -156,7 +156,27 @@ public:
     bool operator==(const ClKernel &other) const override;
     Status generate(ClKernelBlueprint &bp) const override;
 
-    ClEltwiseAddKernelDescriptor desc{};
+    ClElementwiseKernelDescriptor desc{};
+};
+
+struct ClFloorKernel : public ClKernel
+{
+public:
+    Complexity complexity() const override
+    {
+        return Complexity::Simple;
+    }
+    ClFloorKernel()           = default;
+    ~ClFloorKernel() override = default;
+    ClFloorKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClFloorKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+        : ClKernel{ graph, id, config, tensors }, desc{ desc }
+    {
+    }
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+    bool operator==(const ClKernel &other) const override;
+    Status generate(ClKernelBlueprint &bp) const override;
+
+    ClFloorKernelDescriptor desc{};
 };
 
 struct ClKernelGraph
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
index f971196729..274a2517bb 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
@@ -113,9 +113,14 @@ bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1
     return std::make_tuple(conv2d0.pad, conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.pad, conv2d1.stride, conv2d1.dilation);
 }
 
-bool operator==(const AddDescriptor &, const AddDescriptor &)
+bool operator==(const ElementwiseDescriptor &ed0, const ElementwiseDescriptor &ed1)
 {
-    return std::make_tuple() == std::make_tuple(); // Currently two Add ops are always the same
+    return ed0.op == ed1.op; // Compare Arithmatic Operations of two ElementwiseDescriptor objects
+}
+
+bool operator==(const FloorDescriptor &, const FloorDescriptor &)
+{
+    return std::make_tuple() == std::make_tuple(); // Currently two Floor ops are always the same
 }
 
 bool Conv2dContent::operator==(const OperatorContent &other) const
@@ -124,9 +129,15 @@ bool Conv2dContent::operator==(const OperatorContent &other) const
     return desc == converted.desc;
 }
 
-bool AddContent::operator==(const OperatorContent &other) const
+bool ElementwiseContent::operator==(const OperatorContent &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const ElementwiseContent *>(&other);
+    return desc == converted.desc;
+}
+
+bool FloorContent::operator==(const OperatorContent &other) const
 {
-    const auto converted = *utils::cast::polymorphic_downcast<const AddContent *>(&other);
+    const auto converted = *utils::cast::polymorphic_downcast<const FloorContent *>(&other);
     return desc == converted.desc;
 }
 
@@ -311,7 +322,7 @@ Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const
     return Status{};
 }
 
-Status AddContent::translate(ClKernelGraph &kernel_graph) const
+Status ElementwiseContent::translate(ClKernelGraph &kernel_graph) const
 {
     const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -338,16 +349,46 @@ Status AddContent::translate(ClKernelGraph &kernel_graph) const
     DependencyGraph::Id add_id;
     ClKernelConfig      config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
 
-    st = ClAddKernel::validate(lhs->desc, rhs->desc, dst->desc);
+    st = ClElementwiseKernel::validate(lhs->desc, rhs->desc, dst->desc);
     ARM_COMPUTE_RETURN_ON_ERROR(st);
 
-    st = kernel_graph.add_kernel<ClAddKernel>(config, ClEltwiseAddKernelDescriptor{ desc }, tensors, add_id);
+    st = kernel_graph.add_kernel<ClElementwiseKernel>(config, ClElementwiseKernelDescriptor{ desc }, tensors, add_id);
     ARM_COMPUTE_RETURN_ON_ERROR(st);
     ARM_COMPUTE_UNUSED(add_id);
 
     return Status{};
 }
 
+Status FloorContent::translate(ClKernelGraph &kernel_graph) const
+{
+    const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    ITensorDescPack<ClKernelTensor> tensors;
+
+    DependencyGraph::Id src_id;
+    auto                st = add_kernel_tensor(kernel_graph, *_graph, *src, src_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(src_id));
+
+    DependencyGraph::Id dst_id;
+    st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
+
+    DependencyGraph::Id add_id;
+    ClKernelConfig      config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
+
+    st = ClFloorKernel::validate(src->desc, dst->desc);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+    st = kernel_graph.add_kernel<ClFloorKernel>(config, ClFloorKernelDescriptor{ desc }, tensors, add_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+    return Status{};
+}
+
 std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph)
 {
     std::vector<const OperatorContent *> ops;
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
index 2786d610e1..b303cdb9fc 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
@@ -157,19 +157,19 @@ private:
     Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const;
 };
 
-class AddContent : public OperatorContent
+class ElementwiseContent : public OperatorContent
 {
 public:
-    AddContent() = default;
-    AddContent(const OperatorGraph::Implementation *graph, Id id, const AddDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+    ElementwiseContent() = default;
+    ElementwiseContent(const OperatorGraph::Implementation *graph, Id id, const ElementwiseDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
         : OperatorContent(graph, id, tensors), desc(desc)
     {
     }
-    ~AddContent()                  = default;
-    AddContent(const AddContent &) = default;
-    AddContent &operator=(const AddContent &) = default;
-    AddContent(AddContent &&)                 = default;
-    AddContent &operator=(AddContent &&) = default;
+    ~ElementwiseContent()                          = default;
+    ElementwiseContent(const ElementwiseContent &) = default;
+    ElementwiseContent &operator=(const ElementwiseContent &) = default;
+    ElementwiseContent(ElementwiseContent &&)                 = default;
+    ElementwiseContent &operator=(ElementwiseContent &&) = default;
     bool operator==(const OperatorContent &other) const override;
     OperatorComplexity complexity() const override
     {
@@ -178,7 +178,31 @@ public:
     Status translate(ClKernelGraph &kernel_graph) const override;
 
 private:
-    AddDescriptor desc{};
+    ElementwiseDescriptor desc{};
+};
+
+class FloorContent : public OperatorContent
+{
+public:
+    FloorContent() = default;
+    FloorContent(const OperatorGraph::Implementation *graph, Id id, const FloorDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+        : OperatorContent(graph, id, tensors), desc(desc)
+    {
+    }
+    ~FloorContent()                    = default;
+    FloorContent(const FloorContent &) = default;
+    FloorContent &operator=(const FloorContent &) = default;
+    FloorContent(FloorContent &&)                 = default;
+    FloorContent &operator=(FloorContent &&) = default;
+    bool operator==(const OperatorContent &other) const override;
+    OperatorComplexity complexity() const override
+    {
+        return OperatorComplexity::Simple;
+    }
+    Status translate(ClKernelGraph &kernel_graph) const override;
+
+private:
+    FloorDescriptor desc{};
 };
 
 struct OperatorGraph::Implementation
author	Michalis Spyrou <michalis.spyrou@arm.com>	2022-06-15 19:02:28 +0100
committer	Michalis Spyrou <michalis.spyrou@arm.com>	2022-06-27 14:05:05 +0000
commit	b1fcefddf3f59219a9d7930d607175b7e6c39347 (patch)
tree	34e95efded15194b3c8abe4ba3da308c3259301d /src
parent	41eb2d92c89274200d59ff97653e2bd66819b310 (diff)
download	ComputeLibrary-b1fcefddf3f59219a9d7930d607175b7e6c39347.tar.gz