From 1e0208a66ddea1be2d0e715591598c6704660811 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Fri, 22 Jan 2021 15:42:59 +0000
Subject: Make CLArithmeticAddition kernel and function state-less

Resolves COMPMID-4006

Change-Id: Iddc32b0b250142aac9a4a7b9dc0eef462d196025
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4913
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Sang-Hoon Park
---
 .../CL/functions/CLElementwiseOperations.cpp |  64 +++++--------
 src/runtime/CL/functions/CLLogicalAnd.cpp    |  10 +--
 src/runtime/CL/functions/CLLogicalOr.cpp     |  10 +--
 src/runtime/CL/functions/CLPReluLayer.cpp    |   8 +-
 src/runtime/NEON/functions/NELogical.cpp     |  12 +--
 src/runtime/gpu/cl/operators/ClAdd.cpp       |  47 ++++++++++
 src/runtime/gpu/cl/operators/ClAdd.h         | 100 +++++++++++++++++++++
 7 files changed, 189 insertions(+), 62 deletions(-)
 create mode 100644 src/runtime/gpu/cl/operators/ClAdd.cpp
 create mode 100644 src/runtime/gpu/cl/operators/ClAdd.h

(limited to 'src/runtime')

diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index a72e957fe6..638990e472 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,9 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"

-#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
+
+#include "src/runtime/gpu/cl/operators/ClAdd.h"

 #include 

@@ -33,34 +35,13 @@ namespace arm_compute
 {
 namespace experimental
 {
-CLArithmeticAddition::CLArithmeticAddition()
-{
-}
-
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
-    auto k = std::make_unique<CLSaturatedArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
-    _kernel = std::move(k);
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
-    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
-}
-
-void CLArithmeticAddition::run(ITensorPack &tensors)
-{
-    ICLOperator::run(tensors);
-}
-
 CLArithmeticSubtraction::CLArithmeticSubtraction()
 {
 }

 void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
-    auto k = std::make_unique<CLSaturatedArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClSaturatedArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
     _kernel = std::move(k);
 }
@@ -68,7 +49,7 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
 Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(policy);
-    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
+    return arm_compute::opencl::kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
 }

 void CLArithmeticSubtraction::run(ITensorPack &tensors)
@@ -82,14 +63,14 @@ CLArithmeticDivision::CLArithmeticDivision()

 void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    auto k = std::make_unique<CLArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info);
     _kernel = std::move(k);
 }

 Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output, act_info);
+    return arm_compute::opencl::kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, input1, input2, output, act_info);
 }

 void CLArithmeticDivision::run(ITensorPack &tensors)
@@ -103,14 +84,14 @@ CLElementwiseMax::CLElementwiseMax()

 void CLElementwiseMax::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    auto k = std::make_unique<CLArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info);
     _kernel = std::move(k);
 }

 Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output, act_info);
+    return arm_compute::opencl::kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, input1, input2, output, act_info);
 }

 void CLElementwiseMax::run(ITensorPack &tensors)
@@ -124,14 +105,14 @@ CLElementwiseMin::CLElementwiseMin()

 void CLElementwiseMin::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    auto k = std::make_unique<CLArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info);
     _kernel = std::move(k);
 }

 Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output, act_info);
+    return arm_compute::opencl::kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, input1, input2, output, act_info);
 }

 void CLElementwiseMin::run(ITensorPack &tensors)
@@ -145,14 +126,14 @@ CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()

 void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    auto k = std::make_unique<CLArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
     _kernel = std::move(k);
 }

 Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
+    return arm_compute::opencl::kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
 }

 void CLElementwiseSquaredDiff::run(ITensorPack &tensors)
@@ -166,14 +147,14 @@ CLElementwisePower::CLElementwisePower()

 void CLElementwisePower::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    auto k = std::make_unique<CLArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info);
     _kernel = std::move(k);
 }

 Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output, act_info);
+    return arm_compute::opencl::kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, input1, input2, output, act_info);
 }

 void CLElementwisePower::run(ITensorPack &tensors)
@@ -181,13 +162,12 @@ void CLElementwisePower::run(ITensorPack &tensors)
 {
     ICLOperator::run(tensors);
 }
 } // namespace experimental
-
 struct CLArithmeticAddition::Impl
 {
-    const ICLTensor                                      *src_0{ nullptr };
-    const ICLTensor                                      *src_1{ nullptr };
-    ICLTensor                                            *dst{ nullptr };
-    std::unique_ptr<experimental::CLArithmeticAddition>   op{ nullptr };
+    const ICLTensor                *src_0{ nullptr };
+    const ICLTensor                *src_1{ nullptr };
+    ICLTensor                      *dst{ nullptr };
+    std::unique_ptr<opencl::ClAdd>  op{ nullptr };
 };
 CLArithmeticAddition::CLArithmeticAddition()
@@ -209,13 +189,13 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co
     _impl->src_0 = input1;
     _impl->src_1 = input2;
     _impl->dst   = output;
-    _impl->op    = std::make_unique<experimental::CLArithmeticAddition>();
+    _impl->op    = std::make_unique<opencl::ClAdd>();
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
 }

 Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
-    return experimental::CLArithmeticAddition::validate(input1, input2, output, policy, act_info);
+    return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
 }

 void CLArithmeticAddition::run()
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index f1c53651c7..98c98abed5 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"

 #include 

@@ -33,14 +33,14 @@ namespace experimental
 {
 void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
-    auto k = std::make_unique<CLLogicalBinaryKernel>();
-    k->configure(compile_context, kernels::LogicalOperation::And, input1, input2, output);
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
+    k->configure(compile_context, LogicalOperation::And, input1, input2, output);
     _kernel = std::move(k);
 }

 Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    return CLLogicalBinaryKernel::validate(kernels::LogicalOperation::And, input1, input2, output);
+    return arm_compute::opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::And, input1, input2, output);
 }

 void CLLogicalAnd::run(ITensorPack &tensors)
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index 8c6087ed7d..897963ab50 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"

 #include 

@@ -33,14 +33,14 @@ namespace experimental
 {
 void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
-    auto k = std::make_unique<CLLogicalBinaryKernel>();
-    k->configure(compile_context, kernels::LogicalOperation::Or, input1, input2, output);
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
+    k->configure(compile_context, LogicalOperation::Or, input1, input2, output);
     _kernel = std::move(k);
 }

 Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    return CLLogicalBinaryKernel::validate(kernels::LogicalOperation::Or, input1, input2, output);
+    return arm_compute::opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::Or, input1, input2, output);
 }

 void CLLogicalOr::run(ITensorPack &tensors)
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index 876b5de0f7..74286d46ca 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"

 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -37,14 +37,14 @@ CLPReluLayer::CLPReluLayer()

 void CLPReluLayer::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
 {
-    auto k = std::make_unique<CLArithmeticOperationKernel>();
+    auto k = std::make_unique<arm_compute::opencl::kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output);
     _kernel = std::move(k);
 }

 Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
+    return arm_compute::opencl::kernels::ClArithmeticKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
 }

 void CLPReluLayer::run(ITensorPack &tensors)
diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp
index 190998b042..674ba40fcd 100644
--- a/src/runtime/NEON/functions/NELogical.cpp
+++ b/src/runtime/NEON/functions/NELogical.cpp
@@ -50,7 +50,7 @@ void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITens
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);

     _impl->kernel = std::make_unique<kernels::NELogicalKernel>();
-    _impl->kernel->configure(input1->info(), input2->info(), output->info(), kernels::LogicalOperation::And);
+    _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::And);

     _impl->pack = ITensorPack();
     _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1);
@@ -60,7 +60,7 @@ void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITens

 Status NELogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    return kernels::NELogicalKernel::validate(input1, input2, output, kernels::LogicalOperation::And);
+    return kernels::NELogicalKernel::validate(input1, input2, output, LogicalOperation::And);
 }

 void NELogicalAnd::run()
@@ -83,7 +83,7 @@ void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITenso
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);

     _impl->kernel = std::make_unique<kernels::NELogicalKernel>();
-    _impl->kernel->configure(input1->info(), input2->info(), output->info(), kernels::LogicalOperation::Or);
+    _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::Or);

     _impl->pack = ITensorPack();
     _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1);
@@ -93,7 +93,7 @@ void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITenso

 Status NELogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    return kernels::NELogicalKernel::validate(input1, input2, output, kernels::LogicalOperation::Or);
+    return kernels::NELogicalKernel::validate(input1, input2, output, LogicalOperation::Or);
 }

 void NELogicalOr::run()
@@ -116,7 +116,7 @@ void NELogicalNot::configure(const ITensor *input, ITensor *output)
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

     _impl->kernel = std::make_unique<kernels::NELogicalKernel>();
-    _impl->kernel->configure(input->info(), nullptr, output->info(), kernels::LogicalOperation::Not);
+    _impl->kernel->configure(input->info(), nullptr, output->info(), LogicalOperation::Not);

     _impl->pack = ITensorPack();
     _impl->pack.add_tensor(TensorType::ACL_SRC_0, input);
@@ -125,7 +125,7 @@ void NELogicalNot::configure(const ITensor *input, ITensor *output)

 Status NELogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return kernels::NELogicalKernel::validate(input, nullptr, output, kernels::LogicalOperation::Not);
+    return kernels::NELogicalKernel::validate(input, nullptr, output, LogicalOperation::Not);
 }

 void NELogicalNot::run()
diff --git a/src/runtime/gpu/cl/operators/ClAdd.cpp b/src/runtime/gpu/cl/operators/ClAdd.cpp
new file mode 100644
index 0000000000..01f550f819
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClAdd.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/gpu/cl/operators/ClAdd.h"
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+                      ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
+                       ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClAdd.h b/src/runtime/gpu/cl/operators/ClAdd.h
new file mode 100644
index 0000000000..2854c16180
--- /dev/null
+++ b/src/runtime/gpu/cl/operators/ClAdd.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ADD_H
+#define ARM_COMPUTE_CL_ADD_H
+
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/runtime/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic addition
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic addition between two tensors.
+ */
+class ClAdd : public IClOperator
+{
+public:
+    /** Default Constructor */
+    ClAdd() = default;
+    /** Configure function for a given list of arguments.
+     *
+     * Valid configurations (src1,src2) -> dst :
+     *
+     * - (U8,U8)                         -> U8
+     * - (U8,U8)                         -> S16
+     * - (S16,U8)                        -> S16
+     * - (U8,S16)                        -> S16
+     * - (S16,S16)                       -> S16
+     * - (S32,S32)                       -> S32
+     * - (F16,F16)                       -> F16
+     * - (F32,F32)                       -> F32
+     * - (QASYMM8,QASYMM8)               -> QASYMM8
+     * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     * - (QSYMM16,QSYMM16)               -> QSYMM16
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] src1            First source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] src2            Second source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     dst             Destination tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy          Policy to use to handle overflow.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref ClAdd
+     *
+     * Valid configurations (src1,src2) -> dst :
+     *
+     * - (U8,U8)                         -> U8
+     * - (U8,U8)                         -> S16
+     * - (S16,U8)                        -> S16
+     * - (U8,S16)                        -> S16
+     * - (S16,S16)                       -> S16
+     * - (S32,S32)                       -> S32
+     * - (F16,F16)                       -> F16
+     * - (F32,F32)                       -> F32
+     * - (QASYMM8,QASYMM8)               -> QASYMM8
+     * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     * - (QSYMM16,QSYMM16)               -> QSYMM16
+     *
+     * @param[in] src1     First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] src2     Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] dst      Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] policy   Policy to use to handle overflow.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ADD_H */
--
cgit v1.2.1
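
The point of the patch is that opencl::ClAdd is state-less: configure() and validate() only ever see ITensorInfo, and the actual tensors are bound at execution time through an ITensorPack, which is what the CLArithmeticAddition::Impl members above exist for. The sketch below illustrates that calling pattern for library-side code. It is not part of the patch: the helper name run_saturated_add, its parameters, the ConvertPolicy::SATURATE choice, and the exact include set are illustrative assumptions, while the ITensorPack/TensorType usage mirrors what NELogical.cpp does in this same commit.

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"

#include "src/runtime/gpu/cl/operators/ClAdd.h"

using namespace arm_compute;

// Hypothetical helper (not part of the patch): configure once on tensor info,
// then bind the actual tensors for each run through an ITensorPack.
void run_saturated_add(const CLCompileContext &compile_context, const ICLTensor *src0, const ICLTensor *src1, ICLTensor *dst)
{
    opencl::ClAdd add;

    // Configuration and validation only ever see ITensorInfo; the operator
    // keeps no pointers to the tensors themselves.
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClAdd::validate(src0->info(), src1->info(), dst->info(), ConvertPolicy::SATURATE));
    add.configure(compile_context, src0->info(), src1->info(), dst->info(), ConvertPolicy::SATURATE);

    // The tensors are supplied at run time, mirroring what the CLArithmeticAddition
    // function does with the members of its Impl struct.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, src0);
    pack.add_tensor(TensorType::ACL_SRC_1, src1);
    pack.add_tensor(TensorType::ACL_DST, dst);
    add.run(pack);
}

Keeping the tensors out of the operator is what the state-less refactor is after: a single configured ClAdd instance can be re-run against different tensor packs, provided their TensorInfo matches what was passed to configure().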