From 668ccdcfb81bfab3a2d44cd1ddd956e83a2dfb09 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park <sang-hoon.park@arm.com>
Date: Wed, 3 Feb 2021 10:32:59 +0000
Subject: Add dynamic tensor support to CpuElementwise

The kernels and operators for binary and unary operations
are now capable of being configured with dynamic shapes and
computing windows at run-time.

Additionally, argument names are changed
for consistency.

Partially Implements: COMPMID-4127

Change-Id: I48e5038692db667dec7cb2b2906fe5683214fe19
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4973
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/IKernel.h                         |   7 +-
 arm_compute/core/ITensorInfo.h                     |  24 ++-
 arm_compute/core/TensorInfo.h                      |   2 +-
 arm_compute/runtime/NEON/INEOperator.h             |   4 +
 src/core/IKernel.cpp                               |   7 +-
 src/core/Validate.cpp                              |   4 +-
 src/core/cpu/kernels/CpuElementwiseKernel.cpp      | 164 ++++++++++-----------
 src/core/cpu/kernels/CpuElementwiseKernel.h        | 122 +++++++--------
 src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp |  40 +++--
 src/core/cpu/kernels/CpuElementwiseUnaryKernel.h   |  18 +--
 src/core/helpers/WindowHelpers.h                   |  16 ++
 src/runtime/NEON/INEOperator.cpp                   |   7 +-
 src/runtime/cpu/operators/CpuElementwise.cpp       |  72 +++++----
 src/runtime/cpu/operators/CpuElementwise.h         | 150 ++++++++++---------
 src/runtime/cpu/operators/CpuElementwiseUnary.cpp  |  13 ++
 src/runtime/cpu/operators/CpuElementwiseUnary.h    |   3 +
 tests/Utils.h                                      |  53 +++++++
 tests/validation/NEON/ElementwiseDivision.cpp      |  28 ++++
 tests/validation/NEON/ElementwiseRsqrtLayer.cpp    |  19 +++
 .../validation/fixtures/ElementWiseUnaryFixture.h  |  35 ++++-
 .../fixtures/ElementwiseOperationsFixture.h        |  51 ++++++-
 21 files changed, 550 insertions(+), 289 deletions(-)
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
index 11132f20a9..98fd18cc91 100644
--- a/arm_compute/core/IKernel.h
+++ b/arm_compute/core/IKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,11 @@ public:
      * @return The maximum window the kernel can be executed on.
      */
     const Window &window() const;
+    /** Function to check if the embedded window of this kernel has been configured
+     *
+     * @return True if the window has been configured
+     */
+    bool is_window_configured() const;
 
 protected:
     /** Configure the kernel's window
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index 9ddafce7c0..0171e31086 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -41,8 +41,24 @@ class ITensorInfo : public misc::ICloneable<ITensorInfo>
 {
 public:
     using TensorDimsState = Coordinates;
-
-public:
+    /** Get the value representing dynamic dimension state
+     *
+     * @return Value representing dynamic dimension state
+     *
+     */
+    static constexpr int32_t get_dynamic_state_value()
+    {
+        return _dynamic_dimension;
+    }
+    /** Get the value representing static dimension state
+     *
+     * @return Value representing static dimension state
+     *
+     */
+    static constexpr int32_t get_static_state_value()
+    {
+        return _static_dimension;
+    }
     /** Default virtual destructor */
     virtual ~ITensorInfo() = default;
     /** Set the data type to the specified value.
@@ -297,6 +313,10 @@ public:
 
         return std::pair<TensorShape, ValidRegion>(bc_shape, bc_valid_region);
     }
+
+private:
+    static constexpr int32_t _dynamic_dimension = -1;
+    static constexpr int32_t _static_dimension  = 0;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 42a969e01b..633daca063 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -293,7 +293,7 @@ public:
     }
     bool is_dynamic() const override
     {
-        return std::find(std::cbegin(_dims_state), std::cend(_dims_state), -1) != std::cend(_dims_state);
+        return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) != std::cend(_dims_state);
     }
     ITensorInfo &set_is_resizable(bool is_resizable) override
     {
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h
index b21dc49b20..184a5959b4 100644
--- a/arm_compute/runtime/NEON/INEOperator.h
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -34,6 +34,8 @@
 namespace arm_compute
 {
 class ICPPKernel;
+class Window;
+
 using INEKernel = ICPPKernel;
 namespace experimental
 {
@@ -63,6 +65,8 @@ public:
     MemoryRequirements workspace() const override;
 
 protected:
+    void run(ITensorPack &tensors, const Window &window);
+
     std::unique_ptr<INEKernel> _kernel;
     IRuntimeContext           *_ctx;
     MemoryRequirements         _workspace;
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
index 287cd04931..31f1ec7a3f 100644
--- a/src/core/IKernel.cpp
+++ b/src/core/IKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,11 @@ BorderSize IKernel::border_size() const
     return BorderSize(0);
 }
 
+bool IKernel::is_window_configured() const
+{
+    return !((_window.x().start() == _window.x().end()) && (_window.x().end() == 0));
+}
+
 void IKernel::configure(const Window &window)
 {
     _window = window;
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index bd5e494e94..8bb507921a 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -167,7 +167,7 @@ arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *functi
                                                               const arm_compute::IKernel *kernel)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0) && (kernel->window().x().step() == 0),
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(),
                                         function, file, line,
                                         "This kernel hasn't been configured.");
     return arm_compute::Status{};
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
index 1ac21acbc0..23e95f72d7 100644
--- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
@@ -72,9 +72,9 @@ static ElementwiseKernel generate_kernel(UKernelType *ukernel)
 
 template <ArithmeticOperation op>
 std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_arithm_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_UNUSED(input2, output);
+    ARM_COMPUTE_UNUSED(src1, dst);
     static ElementwiseKernel kernels[] =
     {
 #if defined(__ARM_FEATURE_SVE)
@@ -103,7 +103,7 @@ configure_arithm_func(const ITensorInfo *input1, const ITensorInfo *input2, ITen
 
     for(const auto &uk : kernels)
     {
-        if(uk.is_selected(input1->data_type()))
+        if(uk.is_selected(src0->data_type()))
         {
             return uk.ukernel;
         }
@@ -113,10 +113,10 @@ configure_arithm_func(const ITensorInfo *input1, const ITensorInfo *input2, ITen
 }
 
 template <ComparisonOperation op>
-std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)>
-configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_UNUSED(input2, output);
+    ARM_COMPUTE_UNUSED(src1, dst);
     static ElementwiseKernel kernels[] =
     {
 #if defined(__ARM_FEATURE_SVE)
@@ -148,7 +148,7 @@ configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITenso
 
     for(const auto &uk : kernels)
     {
-        if(uk.is_selected(input1->data_type()))
+        if(uk.is_selected(src0->data_type()))
         {
             return uk.ukernel;
         }
@@ -158,45 +158,43 @@ configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITenso
 }
 } // namespace
 
-Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
-    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+    const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
-    // Validate in case of configured output
-    if(output.total_size() > 0)
+    // Validate in case of configured dst
+    if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                         "Wrong shape for output");
     }
 
     return Status{};
 }
 
-void CpuElementwiseKernel::configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-
-    // Configure kernel window
-    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
-    // Auto initialize output if not initialized
-    auto_init_if_empty(*output, out_shape, 1, input1->data_type());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
-    Window win = calculate_max_window(out_shape);
+    // If any of the shapes is dynamic, expect a configured window and dst at run-time.
+    if(src0->is_dynamic() || src1->is_dynamic())
+    {
+        return;
+    }
 
-    ICpuKernel::configure(win);
+    auto shape_and_window = compute_output_shape_and_window(*src0, *src1);
+    auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type());
+    ICpuKernel::configure(shape_and_window.second);
 }
 
 void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
-    ARM_COMPUTE_UNUSED(info, window);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_UNUSED(info);
 
     auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -208,49 +206,49 @@ void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, co
 }
 
 /** Arithmetic operators (min, max, squared_diff) */
-void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
-    configure_common(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+    configure_common(src0, src1, dst);
     _op = op;
 }
 
-Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
-    // Validate in case of configured output
-    if(output.total_size() > 0)
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+    // Validate in case of configured dst
+    if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
     }
-    return validate_arguments_common(input1, input2, output);
+    return validate_arguments_common(src0, src1, dst);
 }
 
-Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
     ARM_COMPUTE_UNUSED(op);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
     return Status{};
 }
 
 std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuArithmeticKernel::get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     switch(_op)
     {
         case ArithmeticOperation::MAX:
-            return configure_arithm_func<ArithmeticOperation::MAX>(input1, input2, output);
+            return configure_arithm_func<ArithmeticOperation::MAX>(src0, src1, dst);
         case ArithmeticOperation::MIN:
-            return configure_arithm_func<ArithmeticOperation::MIN>(input1, input2, output);
+            return configure_arithm_func<ArithmeticOperation::MIN>(src0, src1, dst);
         case ArithmeticOperation::SQUARED_DIFF:
-            return configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(input1, input2, output);
+            return configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(src0, src1, dst);
         case ArithmeticOperation::PRELU:
-            return configure_arithm_func<ArithmeticOperation::PRELU>(input1, input2, output);
+            return configure_arithm_func<ArithmeticOperation::PRELU>(src0, src1, dst);
         case ArithmeticOperation::DIV:
-            return configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output);
+            return configure_arithm_func<ArithmeticOperation::DIV>(src0, src1, dst);
         case ArithmeticOperation::POWER:
-            return configure_arithm_func<ArithmeticOperation::POWER>(input1, input2, output);
+            return configure_arithm_func<ArithmeticOperation::POWER>(src0, src1, dst);
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
@@ -259,91 +257,91 @@ CpuArithmeticKernel::get_implementation(const ITensorInfo *input1, const ITensor
 
 /** The division operator */
 
-void CpuDivisionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
-    configure_common(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+    configure_common(src0, src1, dst);
     _op = ArithmeticOperation::DIV;
 }
 
-Status CpuDivisionKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::S32, DataType::F16, DataType::F32);
-    return CpuArithmeticKernel::validate_arguments(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
+    return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
 }
 
-Status CpuDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
     return Status{};
 }
 
 /** The power operator */
-void CpuPowerKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
-    configure_common(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+    configure_common(src0, src1, dst);
     _op = ArithmeticOperation::POWER;
 }
 
-Status CpuPowerKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
-    return CpuArithmeticKernel::validate_arguments(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32);
+    return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
 }
 
-Status CpuPowerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
     return Status{};
 }
 
 /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
-    configure_common(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
+    configure_common(src0, src1, dst);
     _op = op;
 }
 
-Status CpuComparisonKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
-    // Validate in case of configured output
-    if(output.total_size() > 0)
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+    // Validate in case of configured dst
+    if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
     }
-    return validate_arguments_common(input1, input2, output);
+    return validate_arguments_common(src0, src1, dst);
 }
 
-Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
     ARM_COMPUTE_UNUSED(op);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
     return Status{};
 }
 
 std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuComparisonKernel::get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     switch(_op)
     {
         case ComparisonOperation::Equal:
-            return configure_comp_func<ComparisonOperation::Equal>(input1, input2, output);
+            return configure_comp_func<ComparisonOperation::Equal>(src0, src1, dst);
         case ComparisonOperation::NotEqual:
-            return configure_comp_func<ComparisonOperation::NotEqual>(input1, input2, output);
+            return configure_comp_func<ComparisonOperation::NotEqual>(src0, src1, dst);
         case ComparisonOperation::Greater:
-            return configure_comp_func<ComparisonOperation::Greater>(input1, input2, output);
+            return configure_comp_func<ComparisonOperation::Greater>(src0, src1, dst);
         case ComparisonOperation::GreaterEqual:
-            return configure_comp_func<ComparisonOperation::GreaterEqual>(input1, input2, output);
+            return configure_comp_func<ComparisonOperation::GreaterEqual>(src0, src1, dst);
         case ComparisonOperation::Less:
-            return configure_comp_func<ComparisonOperation::Less>(input1, input2, output);
+            return configure_comp_func<ComparisonOperation::Less>(src0, src1, dst);
         case ComparisonOperation::LessEqual:
-            return configure_comp_func<ComparisonOperation::LessEqual>(input1, input2, output);
+            return configure_comp_func<ComparisonOperation::LessEqual>(src0, src1, dst);
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h
index 92cf880172..952c6e3e25 100644
--- a/src/core/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/core/cpu/kernels/CpuElementwiseKernel.h
@@ -37,7 +37,7 @@ namespace kernels
 /** Interface for an element-wise operation kernel
  *
  * Element-wise operation is computed by:
- * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
+ * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f]
  *
  */
 class CpuElementwiseKernel : public ICpuKernel
@@ -53,9 +53,9 @@ public:
 
     /** Common signature for all the specialised arithmetic functions
      *
-     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor info. Data types supported: Dependent on subclass.
+     * @param[in]  src0   First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in]  src1   Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst    Output tensor info. Data types supported: Dependent on subclass.
      * @param[in]  window Region on which to execute the kernel.
      */
     using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &);
@@ -66,26 +66,26 @@ public:
 protected:
     /** Validate the argument passed to the kernel
      *
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Dependent on subclass.
+     * @param[in] src0 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor. Data types supported: Dependent on subclass.
      */
-    static Status validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+    static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 
     /** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
      *
      */
-    void configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
 
     /** Function to get the micro kernel implementation
      *
-     * @param[in] input1 First input tensor information
-     * @param[in] input2 Second input tensor information
-     * @param[in] output Output tensor information
+     * @param[in] src0 First input tensor information
+     * @param[in] src1 Second input tensor information
+     * @param[in] dst  Output tensor information
      *
      * @return the function instance for the micro kernel
      */
-    virtual std::function<ElementwiseFunction> get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) = 0;
+    virtual std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0;
 };
 
 class CpuArithmeticKernel : public CpuElementwiseKernel
@@ -96,40 +96,40 @@ public:
 
     /** Configure kernel
      *
-     * @param[in]  op     Arithmetic operation to be executed.
-     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  op   Arithmetic operation to be executed.
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
 
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel
      *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] op   Arithmetic operation to be executed.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a Status
      */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
 protected:
     // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+    static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 
     ArithmeticOperation _op{};
 
 private:
     /** Function to get the micro kernel implementation
      *
-     * @param[in] input1 First input tensor information
-     * @param[in] input2 Second input tensor information
-     * @param[in] output Output tensor information
+     * @param[in] src0 First input tensor information
+     * @param[in] src1 Second input tensor information
+     * @param[in] dst  Output tensor information
      *
      * @return the function instance for the micro kernel
      */
-    std::function<ElementwiseFunction> get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) override;
+    std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
 };
 
 class CpuDivisionKernel : public CpuArithmeticKernel
@@ -140,25 +140,25 @@ public:
 
     /** Configure kernel
      *
-     * @param[in]  input1 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CpuDivisionKernel
      *
-     * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
 protected:
     // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+    static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 };
 
 class CpuPowerKernel : public CpuArithmeticKernel
@@ -169,25 +169,25 @@ public:
 
     /** Configure kernel
      *
-     * @param[in]  input1 First tensor input info. Data types supported: F16/F32.
-     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: F16/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CpuPowerKernel
      *
-     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: F16/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
 protected:
     // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+    static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 };
 
 class CpuComparisonKernel : public CpuElementwiseKernel
@@ -198,38 +198,38 @@ public:
 
     /** Configure kernel
      *
-     * @param[in]  op     Comparison operation to be executed.
-     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor info. Data types supported: U8.
+     * @param[in]  op   Comparison operation to be executed.
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: U8.
      */
-    void configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
 
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
      *
-     * @param[in] op     Comparison operation to be executed.
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: U8.
+     * @param[in] op   Comparison operation to be executed.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: U8.
      *
      * @return a Status
      */
-    static Status validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
 protected:
     // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
+    static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 
 private:
     /** Function to get the micro kernel implementation
      *
-     * @param[in] input1 First input tensor information
-     * @param[in] input2 Second input tensor information
-     * @param[in] output Output tensor information
+     * @param[in] src0 First input tensor information
+     * @param[in] src1 Second input tensor information
+     * @param[in] dst  Output tensor information
      *
      * @return the function instance for the micro kernel
      */
-    std::function<ElementwiseFunction> get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) override;
+    std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
 
     ComparisonOperation _op{};
 };
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 2b5c11f8e1..ff2d080c95 100644
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -108,28 +108,28 @@ CpuElementwiseUnaryKernel::CpuElementwiseUnaryKernel()
 {
 }
 
-void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &input, ITensorInfo &output)
+void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate(op, input, output));
-
-    // Configure kernel window
-    const TensorShape &out_shape = TensorShape::broadcast_shape(input.tensor_shape());
-
-    // Auto initialize output if not initialized
-    auto_init_if_empty(output, out_shape, 1, input.data_type());
-
-    Window win = calculate_max_window(out_shape);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
 
     _op = op;
 
-    ICpuKernel::configure(win);
+    // If input shape is dynamic, expect a configured window and dst at run-time.
+    if(src.is_dynamic())
+    {
+        return;
+    }
+
+    auto shape_and_window = compute_output_shape_and_window(src);
+    auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type());
+    ICpuKernel::configure(shape_and_window.second);
 }
 
-Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &input, const ITensorInfo &output)
+Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
 
-    const auto *uk = get_implementation(input.data_type());
+    const auto *uk = get_implementation(src.data_type());
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     switch(op)
@@ -139,19 +139,19 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf
         case ElementWiseUnary::LOG:
         case ElementWiseUnary::ROUND:
         case ElementWiseUnary::SIN:
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
             break;
         case ElementWiseUnary::NEG:
         case ElementWiseUnary::ABS:
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
             break;
         default:
             ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
     }
-    // Validate in case of configured output
-    if(output.total_size() > 0)
+    // Validate in case of configured dst
+    if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
     }
 
     return Status{};
@@ -160,8 +160,6 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf
 void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
 
     auto src  = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto dst  = tensors.get_tensor(TensorType::ACL_DST);
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
index 193f6f1e4f..ceb90dcf70 100644
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 /** Interface for an element-wise unary operation kernel
  *
  * Element-wise operation is computed by:
- * @f[ output(x) = OP(input(x))@f]
+ * @f[ dst(x) = OP(src(x))@f]
  *
  */
 class CpuElementwiseUnaryKernel : public ICpuKernel
@@ -56,21 +56,21 @@ public:
 
     /** Function to configure the @ref CpuElementwiseUnaryKernel
      *
-     * @param[in]  op     Arithmetic operation to be executed.
-     * @param[in]  input  First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
-     * @param[out] output Output tensor. Data types supported: Same as @p input.
+     * @param[in]  op  Arithmetic operation to be executed.
+     * @param[in]  src First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
+     * @param[out] dst Output tensor. Data types supported: Same as @p src.
      */
-    void configure(ElementWiseUnary op, const ITensorInfo &input, ITensorInfo &output);
+    void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CpuElementwiseUnaryKernel
      *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input  First tensor input info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     * @param[in] op  Arithmetic operation to be executed.
+     * @param[in] src First tensor input info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
+     * @param[in] dst Output tensor info. Data types supported: Same as @p src.
      *
      * @return a Status
      */
-    static Status validate(ElementWiseUnary op, const ITensorInfo &input, const ITensorInfo &output);
+    static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
index 9216c33f16..637e9323ab 100644
--- a/src/core/helpers/WindowHelpers.h
+++ b/src/core/helpers/WindowHelpers.h
@@ -177,6 +177,22 @@ inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps
 {
     return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
 }
+
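+/** Compute the broadcast output shape and the maximum window for the given inputs
+ *
+ * @param[in] infos Input tensor information objects whose shapes are broadcast together
+ *
+ * @return A pair of the broadcast output shape and the window computed from the broadcast valid region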
+ */
+template <typename... Infos>
+std::pair<TensorShape, Window> compute_output_shape_and_window(const Infos &... infos)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(infos...);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    return std::make_pair(out_shape, calculate_max_window(valid_region));
+}
 #endif /* DOXYGEN_SKIP_THIS */
 } // namespace arm_compute
 
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
index ccee8ffc21..a5fc0a2726 100644
--- a/src/runtime/NEON/INEOperator.cpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -44,7 +44,12 @@ void INEOperator::run(ITensorPack &tensors)
         ARM_COMPUTE_ERROR("No inputs provided");
     }
 
-    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+    run(tensors, _kernel->window());
+}
+
+void INEOperator::run(ITensorPack &tensors, const Window &window)
+{
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, window, tensors);
 }
 
 void INEOperator::prepare(ITensorPack &constants)
diff --git a/src/runtime/cpu/operators/CpuElementwise.cpp b/src/runtime/cpu/operators/CpuElementwise.cpp
index 322bd09c43..b5c8dde925 100644
--- a/src/runtime/cpu/operators/CpuElementwise.cpp
+++ b/src/runtime/cpu/operators/CpuElementwise.cpp
@@ -23,95 +23,111 @@
  */
 #include "src/runtime/cpu/operators/CpuElementwise.h"
 #include "src/core/cpu/kernels/CpuElementwiseKernel.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void CpuElementwiseMax::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwiseBase::run(ITensorPack &tensors)
+{
+    // If the kernel has been configured, use the window from the kernel.
+    if(_kernel->is_window_configured())
+    {
+        ICpuOperator::run(tensors);
+        return;
+    }
+
+    auto src0_info        = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info();
+    auto src1_info        = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info();
+    auto shape_and_window = compute_output_shape_and_window(*src0_info, *src1_info);
+    ICpuOperator::run(tensors, shape_and_window.second);
+}
+
+void CpuElementwiseMax::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     auto k = std::make_unique<kernels::CpuArithmeticKernel>();
-    k->configure(ArithmeticOperation::MAX, input1, input2, output);
+    k->configure(ArithmeticOperation::MAX, src0, src1, dst);
     _kernel = std::move(k);
 }
 
-Status CpuElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuElementwiseMax::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+    return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MAX, src0, src1, dst);
 }
 
-void CpuElementwiseMin::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwiseMin::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     auto k = std::make_unique<kernels::CpuArithmeticKernel>();
-    k->configure(ArithmeticOperation::MIN, input1, input2, output);
+    k->configure(ArithmeticOperation::MIN, src0, src1, dst);
     _kernel = std::move(k);
 }
 
-Status CpuElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuElementwiseMin::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+    return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MIN, src0, src1, dst);
 }
 
-void CpuElementwiseSquaredDiff::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwiseSquaredDiff::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     auto k = std::make_unique<kernels::CpuArithmeticKernel>();
-    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    k->configure(ArithmeticOperation::SQUARED_DIFF, src0, src1, dst);
     _kernel = std::move(k);
 }
 
-Status CpuElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuElementwiseSquaredDiff::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src0, src1, dst);
 }
 
-void CpuElementwiseDivision::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     auto k = std::make_unique<kernels::CpuDivisionKernel>();
-    k->configure(input1, input2, output);
+    k->configure(src0, src1, dst);
     _kernel = std::move(k);
 }
 
-Status CpuElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    return kernels::CpuDivisionKernel::validate(input1, input2, output);
+    return kernels::CpuDivisionKernel::validate(src0, src1, dst);
 }
 
-void CpuElementwisePower::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     auto k = std::make_unique<kernels::CpuPowerKernel>();
-    k->configure(input1, input2, output);
+    k->configure(src0, src1, dst);
     _kernel = std::move(k);
 }
 
-Status CpuElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    return kernels::CpuPowerKernel::validate(input1, input2, output);
+    return kernels::CpuPowerKernel::validate(src0, src1, dst);
 }
 
 template <ComparisonOperation COP>
-void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
 {
     auto k = std::make_unique<kernels::CpuComparisonKernel>();
-    k->configure(COP, input1, input2, output);
+    k->configure(COP, src0, src1, dst);
     _kernel = std::move(k);
 }
 
 template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    return kernels::CpuComparisonKernel::validate(COP, input1, input2, output);
+    return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
 }
 
-void CpuElementwiseComparison::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op)
+void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
 {
     auto k = std::make_unique<kernels::CpuComparisonKernel>();
-    k->configure(op, input1, input2, output);
+    k->configure(op, src0, src1, dst);
     _kernel = std::move(k);
 }
 
-Status CpuElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
 {
-    return kernels::CpuComparisonKernel::validate(op, input1, input2, output);
+    return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
 }
 
 // Supported Specializations
diff --git a/src/runtime/cpu/operators/CpuElementwise.h b/src/runtime/cpu/operators/CpuElementwise.h
index 611a374c26..4b350d5f9f 100644
--- a/src/runtime/cpu/operators/CpuElementwise.h
+++ b/src/runtime/cpu/operators/CpuElementwise.h
@@ -30,30 +30,36 @@ namespace arm_compute
 {
 namespace cpu
 {
+class CpuElementwiseBase : public ICpuOperator
+{
+public:
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
 /** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for max
  *
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a max operation between two tensors.
  */
-class CpuElementwiseMax : public ICpuOperator
+class CpuElementwiseMax : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs and dst.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max
      *
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for min
@@ -61,25 +67,25 @@ public:
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a min operation between two tensors.
  */
-class CpuElementwiseMin : public ICpuOperator
+class CpuElementwiseMin : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs and dst.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min
      *
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for squared difference
@@ -87,25 +93,25 @@ public:
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
  */
-class CpuElementwiseSquaredDiff : public ICpuOperator
+class CpuElementwiseSquaredDiff : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs and dst.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference
      *
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
@@ -113,25 +119,25 @@ public:
  * @note The tensor data type for the inputs must be S32/F16/F32.
  * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
  */
-class CpuElementwiseDivision : public ICpuOperator
+class CpuElementwiseDivision : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs and dst.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
      *
-     * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
@@ -140,25 +146,25 @@ public:
  * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
  * @note For an exponent that is a float, this function will only work with a positive base.
  */
-class CpuElementwisePower : public ICpuOperator
+class CpuElementwisePower : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs and dst.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: F16/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in]  src0 First tensor input info. Data types supported: F16/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
      *
-     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] src0 First tensor input info. Data types supported: F16/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: Same as @p src0.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
@@ -166,27 +172,27 @@ public:
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a comparison operation between two tensors.
  */
-class CpuElementwiseComparison : public ICpuOperator
+class CpuElementwiseComparison : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs, dst and conversion policy.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: U16/U32.
-     * @param[in]      op     Comparison Operation to be performed.
+     * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out]     dst  Output tensor info. Data types supported: U16/U32.
+     * @param[in]      op   Comparison Operation to be performed.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
      *
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: U16/U32.
-     * @param[in] op     Comparison Operation to be performed.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: U16/U32.
+     * @param[in] op   Comparison Operation to be performed.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuComparisonKernel
@@ -195,25 +201,25 @@ public:
  * @note The function performs a comparison operation between two tensors.
  */
 template <ComparisonOperation op>
-class CpuElementwiseComparisonStatic : public ICpuOperator
+class CpuElementwiseComparisonStatic : public CpuElementwiseBase
 {
 public:
-    /** Initialise the kernel's inputs, output and conversion policy.
+    /** Initialise the kernel's inputs, dst and conversion policy.
      *
-     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor info. Data types supported: U16/U32.
+     * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out]     dst  Output tensor info. Data types supported: U16/U32.
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
      *
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: U16/U32.
+     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[in] dst  Output tensor info. Data types supported: U16/U32.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 };
 
 /** Basic function to run equal comparison. */
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp b/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
index d1b1700927..2140c5cf78 100644
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
+++ b/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
@@ -23,6 +23,7 @@
  */
 #include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
 #include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
@@ -41,5 +42,17 @@ Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src
 {
     return KernelType::validate(op, src, dst);
 }
+
+void CpuElementwiseUnary::run(ITensorPack &tensors)
+{
+    if(_kernel->is_window_configured()) // Kernel already has a window: take the regular run path
+    {
+        ICpuOperator::run(tensors);
+        return;
+    }
+
+    auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info(); // No window configured yet: use the source tensor's info as seen at run time
+    ICpuOperator::run(tensors, compute_output_shape_and_window(*src_info).second); // Only .second of the helper's result is forwarded (presumably the window - confirm in WindowHelpers.h)
+}
 } // namespace cpu
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.h b/src/runtime/cpu/operators/CpuElementwiseUnary.h
index 0b2a9e730d..721ba2a85b 100644
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.h
+++ b/src/runtime/cpu/operators/CpuElementwiseUnary.h
@@ -50,6 +50,9 @@ public:
      * @return a status
      */
     static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
 };
 
 } // namespace cpu
diff --git a/tests/Utils.h b/tests/Utils.h
index 2569c41a9e..fe9fe712cf 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -814,6 +814,59 @@ inline void sync_tensor_if_necessary(TensorType &tensor)
 {
     ARM_COMPUTE_UNUSED(tensor);
 }
+
+/** Construct and return object for dimensions' state filled with the given value
+ *
+ * @param[in] value The value to fill
+ *
+ * @return Constructed class
+ */
+inline ITensorInfo::TensorDimsState construct_dims_state(int32_t value)
+{
+    auto states = ITensorInfo::TensorDimsState{};
+    std::fill(states.begin(), states.end(), value);
+    return states;
+}
+
+/** Construct and return object for dimensions' state filled with the value for dynamic state
+ *
+ * @return Constructed class filled with the value for dynamic state
+ */
+inline ITensorInfo::TensorDimsState construct_dynamic_dims_state()
+{
+    return construct_dims_state(ITensorInfo::get_dynamic_state_value());
+}
+
+/** Construct and return object for dimensions' state filled with the value for non-dynamic state
+ *
+ * @return Constructed class filled with the value for non-dynamic state
+ */
+inline ITensorInfo::TensorDimsState construct_static_dims_state()
+{
+    return construct_dims_state(ITensorInfo::get_static_state_value());
+}
+
+/** Set the dimension states of the given tensor to dynamic
+ *
+ * @param[in] t The tensor to set to dynamic state
+ *
+ */
+template <typename TensorType>
+void set_tensor_dynamic(TensorType &t)
+{
+    t.info()->set_tensor_dims_state(construct_dynamic_dims_state());
+}
+
+/** Set the dimension states of the given tensor to static
+ *
+ * @param[in] t The tensor to set to static state
+ *
+ */
+template <typename TensorType>
+void set_tensor_static(TensorType &t)
+{
+    t.info()->set_tensor_dims_state(construct_static_dims_state());
+}
 } // namespace test
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_TEST_UTILS_H */
diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp
index 3656560281..8abccb2ed6 100644
--- a/tests/validation/NEON/ElementwiseDivision.cpp
+++ b/tests/validation/NEON/ElementwiseDivision.cpp
@@ -93,6 +93,34 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+// These test cases will execute the function with dynamic-stated shapes
+// Since other elementwise operations share the same kernel, these tests are added only here.
+// Also, only FP32 is tested since data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+template <typename T>
+using CpuElementwiseDivisionDynamicShapeFixture = ArithmeticDivisionDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+template <typename T>
+using CpuElementwiseDivisionBroadcastDynamicShapeFixture = ArithmeticDivisionBroadcastDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuElementwiseDivisionDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CpuElementwiseDivisionBroadcastDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+TEST_SUITE_END() // F32
+TEST_SUITE_END() // DynamicShape
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
diff --git a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
index f41500cc0b..1591b76cd7 100644
--- a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
+++ b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
@@ -50,6 +50,25 @@ RelativeTolerance<float> tolerance_fp16(0.01f);
 TEST_SUITE(NEON)
 TEST_SUITE(RsqrtLayer)
 
+// These test cases will execute the function with dynamic-stated shapes
+// Since other elementwise unary operations share the same kernel, these tests are added only here.
+// Also, only FP32 is tested since data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+TEST_SUITE(FP32)
+
+template <typename T>
+using CpuRsqrtDynamicShapeFixture = RsqrtDynamicShapeValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuRsqrtDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                          DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // DynamicShape
+
 template <typename T>
 using NERsqrtLayerFixture = RsqrtValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
 
diff --git a/tests/validation/fixtures/ElementWiseUnaryFixture.h b/tests/validation/fixtures/ElementWiseUnaryFixture.h
index 8cffef48f6..9729907630 100644
--- a/tests/validation/fixtures/ElementWiseUnaryFixture.h
+++ b/tests/validation/fixtures/ElementWiseUnaryFixture.h
@@ -44,11 +44,12 @@ class ElementWiseUnaryValidationFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, DataType input_data_type, bool in_place, ElementWiseUnary op)
+    void setup(TensorShape input_shape, DataType input_data_type, bool in_place, ElementWiseUnary op, bool use_dynamic_shape = false)
     {
-        _op        = op;
-        _target    = compute_target(input_shape, input_data_type, in_place);
-        _reference = compute_reference(input_shape, input_data_type);
+        _op                = op;
+        _use_dynamic_shape = use_dynamic_shape; // Must be set before the target is computed: that path checks this flag
+        _target            = compute_target(input_shape, input_data_type, in_place);
+        _reference         = compute_reference(input_shape, input_data_type);
     }
 
 protected:
@@ -131,10 +132,24 @@ protected:
 
         TensorType *actual_dst = in_place ? &src : &dst;
 
+        // if _use_dynamic_shape is true, this fixture will test scenario for dynamic shapes.
+        // - At configure time, all input tensors are marked as dynamic using set_tensor_dynamic()
+        // - After configure, tensors are marked as static for run using set_tensor_static()
+        // - The tensors with static shape are given to run()
+        if(_use_dynamic_shape)
+        {
+            set_tensor_dynamic(src);
+        }
+
         // Create and configure function
         FunctionType elwiseunary_layer;
         elwiseunary_layer.configure(&src, actual_dst);
 
+        if(_use_dynamic_shape)
+        {
+            set_tensor_static(src);
+        }
+
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         src.allocator()->allocate();
         ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -175,6 +190,7 @@ protected:
     TensorType       _target{};
     SimpleTensor<T>  _reference{};
     ElementWiseUnary _op{};
+    bool             _use_dynamic_shape{ false };
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -188,6 +204,17 @@ public:
     }
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RsqrtDynamicShapeValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape, DataType data_type)
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::RSQRT, true);
+    }
+};
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class ExpValidationFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
diff --git a/tests/validation/fixtures/ElementwiseOperationsFixture.h b/tests/validation/fixtures/ElementwiseOperationsFixture.h
index dcb408c801..bf51c7e69b 100644
--- a/tests/validation/fixtures/ElementwiseOperationsFixture.h
+++ b/tests/validation/fixtures/ElementwiseOperationsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,9 +48,11 @@ public:
     template <typename...>
     void setup(ArithmeticOperation op, const TensorShape &shape0, const TensorShape &shape1,
                DataType data_type0, DataType data_type1, DataType output_data_type,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool use_dynamic_shape = false)
     {
-        _op        = op;
+        _op                = op;
+        _use_dynamic_shape = use_dynamic_shape; // Set before the target is computed: that path checks this flag
+
         _target    = compute_target(shape0, shape1, data_type0, data_type1, output_data_type, qinfo0, qinfo1, qinfo_out);
         _reference = compute_reference(shape0, shape1, data_type0, data_type1, output_data_type, qinfo0, qinfo1, qinfo_out);
     }
@@ -87,10 +89,26 @@ protected:
         TensorType ref_src2 = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
         TensorType dst      = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out);
 
+        // if _use_dynamic_shape is true, this fixture will test scenario for dynamic shapes.
+        // - At configure time, all input tensors are marked as dynamic using set_tensor_dynamic()
+        // - After configure, tensors are marked as static for run using set_tensor_static()
+        // - The tensors with static shape are given to run()
+        if(_use_dynamic_shape)
+        {
+            set_tensor_dynamic(ref_src1);
+            set_tensor_dynamic(ref_src2);
+        }
+
         // Create and configure function
         FunctionType elem_op;
         elem_op.configure(&ref_src1, &ref_src2, &dst);
 
+        if(_use_dynamic_shape)
+        {
+            set_tensor_static(ref_src1);
+            set_tensor_static(ref_src2);
+        }
+
         ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -133,6 +151,7 @@ protected:
     TensorType          _target{};
     SimpleTensor<T>     _reference{};
     ArithmeticOperation _op{ ArithmeticOperation::ADD };
+    bool                _use_dynamic_shape{ false };
 };
 
 // Arithmetic operation fused with activation function
@@ -225,6 +244,32 @@ public:
     }
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticDivisionBroadcastDynamicShapeValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type)
+    {
+        ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape0, shape1,
+                                                                                             data_type0, data_type1, output_data_type,
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), true);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticDivisionDynamicShapeValidationFixture : public ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type)
+    {
+        ArithmeticOperationsGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(ArithmeticOperation::DIV, shape, shape,
+                                                                                             data_type0, data_type1, output_data_type,
+                                                                                             QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), true);
+    }
+};
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class ArithmeticDivisionBroadcastValidationFloatFixture : public ArithmeticOperationsFuseActivationFixture<TensorType, AccessorType, FunctionType, T>
 {
-- 
cgit v1.2.1