aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl/operators
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2021-08-20 21:39:25 +0100
committerGeorgios Pinitas <georgios.pinitas@arm.com>2021-08-25 16:23:15 +0000
commit7891a73ef36f4ad7b71069b3c57694f85bb79454 (patch)
tree5b08692989e28ce63de2937d8d92ea5176589dbe /src/gpu/cl/operators
parenta46c9c98c2b1d70acc7c6eee00e2cdc2a1e209a6 (diff)
downloadComputeLibrary-7891a73ef36f4ad7b71069b3c57694f85bb79454.tar.gz
Move CPU/GPU files from Core/Runtime to the respective backend folders
Legacy structure contained two libraries core/runtime with two backends in each. We reduce the core/runtime libraries to a single library thus merging the backend files Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: I69545765fe7a730368105cdbd067d3135ec7a174 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6155 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/operators')
-rw-r--r--src/gpu/cl/operators/ClActivation.cpp80
-rw-r--r--src/gpu/cl/operators/ClActivation.h56
-rw-r--r--src/gpu/cl/operators/ClAdd.cpp47
-rw-r--r--src/gpu/cl/operators/ClAdd.h80
-rw-r--r--src/gpu/cl/operators/ClCast.cpp45
-rw-r--r--src/gpu/cl/operators/ClCast.h72
-rw-r--r--src/gpu/cl/operators/ClConcatenate.cpp247
-rw-r--r--src/gpu/cl/operators/ClConcatenate.h79
-rw-r--r--src/gpu/cl/operators/ClConv2d.cpp292
-rw-r--r--src/gpu/cl/operators/ClConv2d.h152
-rw-r--r--src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp45
-rw-r--r--src/gpu/cl/operators/ClConvertFullyConnectedWeights.h57
-rw-r--r--src/gpu/cl/operators/ClCopy.cpp45
-rw-r--r--src/gpu/cl/operators/ClCopy.h58
-rw-r--r--src/gpu/cl/operators/ClCrop.cpp46
-rw-r--r--src/gpu/cl/operators/ClCrop.h65
-rw-r--r--src/gpu/cl/operators/ClDequantize.cpp53
-rw-r--r--src/gpu/cl/operators/ClDequantize.h58
-rw-r--r--src/gpu/cl/operators/ClDirectConv2d.cpp106
-rw-r--r--src/gpu/cl/operators/ClDirectConv2d.h82
-rw-r--r--src/gpu/cl/operators/ClElementwiseOperations.cpp92
-rw-r--r--src/gpu/cl/operators/ClElementwiseOperations.h165
-rw-r--r--src/gpu/cl/operators/ClElementwiseUnary.cpp116
-rw-r--r--src/gpu/cl/operators/ClElementwiseUnary.h175
-rw-r--r--src/gpu/cl/operators/ClFill.cpp45
-rw-r--r--src/gpu/cl/operators/ClFill.h57
-rw-r--r--src/gpu/cl/operators/ClFlatten.cpp45
-rw-r--r--src/gpu/cl/operators/ClFlatten.h66
-rw-r--r--src/gpu/cl/operators/ClFloor.cpp45
-rw-r--r--src/gpu/cl/operators/ClFloor.h55
-rw-r--r--src/gpu/cl/operators/ClFullyConnected.cpp496
-rw-r--r--src/gpu/cl/operators/ClFullyConnected.h138
-rw-r--r--src/gpu/cl/operators/ClGemm.cpp771
-rw-r--r--src/gpu/cl/operators/ClGemm.h137
-rw-r--r--src/gpu/cl/operators/ClGemmConv2d.cpp628
-rw-r--r--src/gpu/cl/operators/ClGemmConv2d.h185
-rw-r--r--src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp786
-rw-r--r--src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h155
-rw-r--r--src/gpu/cl/operators/ClGemmLowpOutputStage.cpp98
-rw-r--r--src/gpu/cl/operators/ClGemmLowpOutputStage.h88
-rw-r--r--src/gpu/cl/operators/ClLogicalNot.cpp45
-rw-r--r--src/gpu/cl/operators/ClLogicalNot.h55
-rw-r--r--src/gpu/cl/operators/ClMul.cpp60
-rw-r--r--src/gpu/cl/operators/ClMul.h103
-rw-r--r--src/gpu/cl/operators/ClPRelu.cpp57
-rw-r--r--src/gpu/cl/operators/ClPRelu.h64
-rw-r--r--src/gpu/cl/operators/ClPermute.cpp45
-rw-r--r--src/gpu/cl/operators/ClPermute.h58
-rw-r--r--src/gpu/cl/operators/ClPool2d.cpp101
-rw-r--r--src/gpu/cl/operators/ClPool2d.h72
-rw-r--r--src/gpu/cl/operators/ClQuantize.cpp53
-rw-r--r--src/gpu/cl/operators/ClQuantize.h60
-rw-r--r--src/gpu/cl/operators/ClReshape.cpp45
-rw-r--r--src/gpu/cl/operators/ClReshape.h55
-rw-r--r--src/gpu/cl/operators/ClScale.cpp60
-rw-r--r--src/gpu/cl/operators/ClScale.h66
-rw-r--r--src/gpu/cl/operators/ClSoftmax.cpp186
-rw-r--r--src/gpu/cl/operators/ClSoftmax.h95
-rw-r--r--src/gpu/cl/operators/ClSub.cpp47
-rw-r--r--src/gpu/cl/operators/ClSub.h80
-rw-r--r--src/gpu/cl/operators/ClTranspose.cpp45
-rw-r--r--src/gpu/cl/operators/ClTranspose.h55
-rw-r--r--src/gpu/cl/operators/ClWinogradConv2d.cpp306
-rw-r--r--src/gpu/cl/operators/ClWinogradConv2d.h126
64 files changed, 8047 insertions, 0 deletions
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
new file mode 100644
index 0000000000..6b36cc34b4
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClActivation.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+
+#include "src/common/IOperator.h"
+#include "src/common/utils/LegacySupport.h"
+#include "src/gpu/cl/ClContext.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClActivationKernel>();
+ k->configure(compile_context, src, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClActivationKernel::validate(src, dst, act_info);
+}
+} // namespace opencl
+
+namespace gpu
+{
+namespace opencl
+{
+std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+{
+ TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
+ TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
+ auto info = detail::convert_to_activation_info(act);
+
+ if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ {
+ return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
+ }
+
+ auto act_op = std::make_unique<arm_compute::opencl::ClActivation>();
+ act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
+
+ auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
+ if(op == nullptr)
+ {
+ ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
+ return std::make_tuple(nullptr, StatusCode::OutOfMemory);
+ }
+ op->set_internal_operator(std::move(act_op));
+
+ return std::make_tuple(op, StatusCode::Success);
+}
+} // namespace opencl
+} // namespace gpu
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h
new file mode 100644
index 0000000000..75b38e8a00
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ACTIVATION_H
+#define ARM_COMPUTE_CL_ACTIVATION_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClActivationKernel */
+class ClActivation : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ * @param[in] activation_info Activation layer parameters.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClActivation::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ACTIVATION_H */
diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp
new file mode 100644
index 0000000000..e1a013a6b5
--- /dev/null
+++ b/src/gpu/cl/operators/ClAdd.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClAdd.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+ ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
+ ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h
new file mode 100644
index 0000000000..d99f983ed0
--- /dev/null
+++ b/src/gpu/cl/operators/ClAdd.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ADD_H
+#define ARM_COMPUTE_CL_ADD_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic addition
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic addition between two tensors.
+ */
+class ClAdd : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * Valid configurations (src1,src2) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClAdd::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ADD_H */
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
new file mode 100644
index 0000000000..8911d208a7
--- /dev/null
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCast.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+ auto k = std::make_unique<kernels::ClCastKernel>();
+ k->configure(compile_context, src, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ return kernels::ClCastKernel::validate(src, dst, policy);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h
new file mode 100644
index 0000000000..1b67ff7c8e
--- /dev/null
+++ b/src/gpu/cl/operators/ClCast.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CAST_H
+#define ARM_COMPUTE_CL_CAST_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCastKernel */
+class ClCast : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @note Input data type must be different than output data type.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------------------------------|
+ * |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |U16 | U8, S8, S16, U32, S32, F16, F32 |
+ * |S16 | U8, S8, U16, U32, S32, F16, F32 |
+ * |U32 | U8, S8, U16, S16, S32, F16, F32 |
+ * |S32 | U8, S8, U16, S16, U32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, U32, F32 |
+ * |F32 | U8, S8, U16, S16, U32, F16 |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] policy Conversion policy.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCast::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CAST_H */
diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp
new file mode 100644
index 0000000000..731d9b5054
--- /dev/null
+++ b/src/gpu/cl/operators/ClConcatenate.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConcatenate.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ _axis = axis;
+ _num_inputs = src_vector.size();
+
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
+ std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
+ std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t;
+ });
+
+ // dst auto inizialitation if not yet initialized
+ auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
+
+ unsigned int offset = 0;
+ switch(_axis)
+ {
+ case Window::DimX:
+ {
+ switch(_num_inputs)
+ {
+ case 2:
+ {
+ // Configure WidthConcatenate2Tensors kernel
+ auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 4:
+ {
+ // Configure WidthConcatenate4Tensors kernel
+ auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ {
+ // Configure generic case WidthConcatenate kernels
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ }
+ break;
+ }
+ case Window::DimY:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case 3:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+}
+
+Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr);
+ const unsigned int num_inputs = src_vector.size();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+ unsigned int offset = 0;
+ switch(axis)
+ {
+ case Window::DimX:
+ {
+ switch(num_inputs)
+ {
+ case 2:
+ // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
+ break;
+ case 4:
+ // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
+ break;
+ default:
+ // Validate generic case of WidthConcatenate kernel
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ break;
+ }
+ case Window::DimY:
+ {
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ case 3:
+ {
+ for(const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+
+ if(dst->total_size() != 0)
+ {
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+
+void ClConcatenate::run(ITensorPack &tensors)
+{
+ if(tensors.empty())
+ {
+ ARM_COMPUTE_ERROR("No inputs provided");
+ }
+
+ if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+ {
+ ARM_COMPUTE_ERROR("Configured with different number of inputs");
+ }
+
+ if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+ {
+ ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
+ CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
+ }
+ else
+ {
+ int i = 0;
+ for(auto &k : _concat_kernels)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+ CLScheduler::get().enqueue_op(*k, pack, true);
+ ++i;
+ }
+ }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h
new file mode 100644
index 0000000000..de0cf84d2c
--- /dev/null
+++ b/src/gpu/cl/operators/ClConcatenate.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONCATENATE_H
+#define ARM_COMPUTE_CLCONCATENATE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
+ * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
+ * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
+ * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
+ */
+class ClConcatenate : public IClOperator
+{
+public:
+ ClConcatenate() = default;
+ /** Initialise the kernel's inputs vector and dst.
+ *
+ * @note Input and dst tensor dimensions preconditions defer depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
+ * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
+ *
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] src_vector The vectors containing all the tensors info to concatenate. Data types supported: All
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector.
+ * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+ */
+ void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClConcatenate::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::vector<std::unique_ptr<IClKernel>> _concat_kernels{};
+ unsigned int _num_inputs{ 0 };
+ unsigned int _axis{ 0 };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONCATENATE_H */
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
new file mode 100644
index 0000000000..c91a4831a8
--- /dev/null
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include <memory>
+
+namespace
+{
+/** Get the suitable kernel size for using direct convolution method with NHWC data layout.
+ *
+ * @note Direct convolution should be executed when the kernel has the spatial dimensions greater than or equal to the value returned by this function
+ *
+ * @param[in] gpu_target GPU target
+ *
+ * @return the suitable kernel size for using direct convolution method with NHWC data layout
+ */
+size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
+{
+ switch(gpu_target)
+ {
+ case arm_compute::GPUTarget::G76:
+ case arm_compute::GPUTarget::G77:
+ case arm_compute::GPUTarget::G78:
+ return 5;
+ case arm_compute::GPUTarget::G71:
+ case arm_compute::GPUTarget::G72:
+ case arm_compute::GPUTarget::MIDGARD:
+ case arm_compute::GPUTarget::BIFROST:
+ return 7;
+ default:
+ return 5;
+ }
+}
+} // namespace
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClConv2d::ClConv2d()
+ : _operator()
+{
+}
+
+ClConv2d::~ClConv2d() = default;
+
+void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
+
+ switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+ auto f = std::make_unique<ClWinogradConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
+ _operator = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::DIRECT:
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+ auto f = std::make_unique<ClDirectConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
+ _operator = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::GEMM:
+ {
+ auto f = std::make_unique<ClGemmConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info, weights_info);
+ _operator = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+ _aux_mem = _operator->workspace();
+}
+
+Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ {
+ //Validate Winograd
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
+ break;
+ }
+ case ConvolutionMethod::DIRECT:
+ {
+ // Validate direct convolution layer
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+ break;
+ }
+ case ConvolutionMethod::GEMM:
+ {
+ // Validate gemm-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info, const GPUTarget gpu_target)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
+ ARM_COMPUTE_UNUSED(weights_info);
+
+ const PadStrideInfo conv_info = conv2d_info.conv_info;
+ const ActivationLayerInfo act_info = conv2d_info.act_info;
+ const Size2D dilation = conv2d_info.dilation;
+ bool enable_fast_math = conv2d_info.enable_fast_math;
+
+ const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+ /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+ using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
+ using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+ const std::vector<ConfigurationMethod> known_configs =
+ {
+ // Alexnet
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+ // VGG16 / VGG19
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+ };
+
+ const auto find_config = [&](ConfigurationMethod c)
+ {
+ const ConvolutionConfiguration config = c.first;
+ const PadStrideInfo info = std::get<3>(config);
+ const DataLayout data_layout = std::get<4>(config);
+
+ return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+ && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+ && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout());
+ };
+
+ std::vector<ConfigurationMethod>::const_iterator found;
+ if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ {
+ return (*found).second;
+ }
+
+ if(dilation != Size2D(1U, 1U))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ if(src->data_layout() == DataLayout::NCHW)
+ {
+ // SRGAN
+ if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
+ && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if(src->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+ const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
+ const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
+
+ // SRGAN case
+ if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
+ && is_direct_valid)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+
+ // Floating-point case: GeMM/Direct/Winograd
+ if(is_data_type_float(src->data_type()))
+ {
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+
+ // Run Winograd if valid and IFM >= 16
+ if(is_wino_valid && is_ifm_ge_16)
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
+ // Run Direct for Large kernel size
+ if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+
+ // Default case
+ return ConvolutionMethod::GEMM;
+ }
+
+ // Generic case for quantized. Only GeMM
+ return ConvolutionMethod::GEMM;
+ }
+ }
+}
+
+void ClConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+ _operator->run(tensors);
+}
+
+void ClConv2d::prepare(ITensorPack &tensors)
+{
+ _operator->prepare(tensors);
+}
+
+experimental::MemoryRequirements ClConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h
new file mode 100644
index 0000000000..1c3a81c77a
--- /dev/null
+++ b/src/gpu/cl/operators/ClConv2d.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONV2D_H
+#define ARM_COMPUTE_CLCONV2D_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::ClGemmConv2d
+ * -# @ref opencl::ClWinogradConv2d
+ * -# @ref opencl::ClDirectConv2d
+ * -# @ref CLFFTConvolutionLayer
+ *
+ * The function selects one of the algorithms mentioned above based on:
+ * - The size of the kernel
+ * - Number of src/dst feature maps
+ * - Amount of memory needed
+ *
+ * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
+ *
+ * FP32 Algorithm| Filter Size | Input/Output feature maps |
+ * --------------|-------------------------------------------------------------|-------------------------------------------|
+ * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 |
+ * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps |
+ * DirectConv | 9x9 | |
+ * GEMM | Any size | |
+ *
+ * Winograd 5x5 requires fast maths enabled.
+ *
+ * FP16 Algorithm| Filter Size | Input/Output feature maps |
+ * --------------|----------------------------|-------------------------------------------|
+ * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5 | Input channels is greater than 3 |
+ * FFT | Not supported | |
+ * DirectConv | 9x9 | |
+ * GEMM | Any size | |
+ *
+ * Winograd FP16 requires fast maths enabled.
+ *
+ */
+class ClConv2d : public IClOperator
+{
+public:
+ /** Default constructor */
+ ClConv2d();
+ /** Default Destructor */
+ ~ClConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClConv2d(const ClConv2d &) = delete;
+ /** Default move constructor */
+ ClConv2d(ClConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClConv2d &operator=(const ClConv2d &) = delete;
+ /** Default move assignment operator */
+ ClConv2d &operator=(ClConv2d &&) = default;
+ /** Set the src and dst tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of srcs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Same as @p src, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d
+ *
+ * Similar to ClConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will return the convolution called by @ref ClConv2d
+ *
+ * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of srcs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel.
+ * @param[in] gpu_target Specifies the @p GPUTarget.
+ *
+ * @return the Convolution Method Hint
+ */
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info, const GPUTarget gpu_target);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<IClOperator> _operator;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONV2D_H */
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000000..61e33f2fdb
--- /dev/null
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+{
+ auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
+ k->configure(compile_context, src, dst, original_src_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+{
+ return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
new file mode 100644
index 0000000000..2794eb17b0
--- /dev/null
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
+#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */
+class ClConvertFullyConnectedWeights : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[in] dst The dst tensor info. Data types supported: Same as @p src
+ * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClConvertFullyConnectedWeights::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H */
diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp
new file mode 100644
index 0000000000..c1a9f264b6
--- /dev/null
+++ b/src/gpu/cl/operators/ClCopy.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCopy.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
+{
+ auto k = std::make_unique<kernels::ClCopyKernel>();
+ k->configure(compile_context, src, dst, dst_window);
+ _kernel = std::move(k);
+}
+
+Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window)
+{
+ return kernels::ClCopyKernel::validate(src, dst, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h
new file mode 100644
index 0000000000..9b427f9675
--- /dev/null
+++ b/src/gpu/cl/operators/ClCopy.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COPY_H
+#define ARM_COMPUTE_CL_COPY_H
+
+#include "arm_compute/core/Window.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCopyKernel */
+class ClCopy : public IClOperator
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src.
+ * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCopy::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COPY_H */
diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp
new file mode 100644
index 0000000000..a6a1c8b103
--- /dev/null
+++ b/src/gpu/cl/operators/ClCrop.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCrop.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCropKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
+ Window *dst_window)
+{
+ auto k = std::make_unique<kernels::ClCropKernel>();
+ k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window);
+ _kernel = std::move(k);
+}
+
+Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+{
+ return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h
new file mode 100644
index 0000000000..1cf1c9bff4
--- /dev/null
+++ b/src/gpu/cl/operators/ClCrop.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_H
+#define ARM_COMPUTE_CL_CROP_H
+
+#include "arm_compute/core/Window.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCropKernel */
+class ClCrop : public IClOperator
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+ * @param[out] dst Destination tensor info. Data type supported: F32
+ * @param[in] start Coordinates of where to start cropping the image.
+ * @param[in] end Coordinates of where to end cropping the image.
+ * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
+ * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
+ * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+ Window *dst_window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCrop::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+ Window *dst_window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CROP_H */
diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp
new file mode 100644
index 0000000000..dbaa5f67df
--- /dev/null
+++ b/src/gpu/cl/operators/ClDequantize.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDequantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClDequantizeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClDequantizeKernel::validate(src, dst);
+}
+
+void ClDequantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDequantize.h b/src/gpu/cl/operators/ClDequantize.h
new file mode 100644
index 0000000000..ccaac2cd49
--- /dev/null
+++ b/src/gpu/cl/operators/ClDequantize.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */
+class ClDequantize : public IClOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+ * @param[out] dst Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClDequantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
new file mode 100644
index 0000000000..50e63beedc
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+ITensorPack select_activation_src_dst(ITensorPack &tensors)
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
+ return pack;
+}
+} // namespace
+
+void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+
+ // Configure direct convolution kernel
+ const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo();
+ auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info);
+ _direct_conv_kernel = std::move(k);
+
+ // Configure border handler
+ PixelValue zero_value(0.f);
+ if(is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+ }
+ auto b = std::make_unique<CLFillBorderKernel>();
+ b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
+ _src_border_handler = std::move(b);
+
+ // Fused activation is currently supported for NHWC and floating point types
+ if(act_info.enabled() && !conv2d_act_info.enabled())
+ {
+ auto a = std::make_unique<kernels::ClActivationKernel>();
+ a->configure(compile_context, dst, dst, act_info);
+ _activation_kernel = std::move(a);
+ }
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
+}
+
+Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target()));
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
+ }
+ return Status{};
+}
+
+void ClDirectConv2d::run(ITensorPack &tensors)
+{
+ // Run border handler
+ CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
+ // Run direct convolution
+ CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
+ // Run activation kernel
+ if(_activation_kernel)
+ {
+ auto act_pack = select_activation_src_dst(tensors);
+ CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
+ }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h
new file mode 100644
index 0000000000..85365b76ff
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv2d.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref opencl::ClDirectConv2d
+ */
+class ClDirectConv2d : public IClOperator
+{
+public:
+ ClDirectConv2d() = default;
+ /** Set the src and dst tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of srcs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
+ std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
+ std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp
new file mode 100644
index 0000000000..4e4cd5ae9d
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
+}
+
+void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
+}
+
+void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
+}
+
+void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+}
+
+void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h
new file mode 100644
index 0000000000..304b250d66
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an arithmetic division between two tensors.
+ */
+class ClElementwiseDivision : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: F16/F32.
+ * @param[in] src2 Second source tensor info. same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseDivision::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class ClElementwiseMax : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseMax::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class ClElementwiseMin : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseMin::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
+ * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
+ */
+class ClElementwiseSquaredDiff : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseSquaredDiff::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ */
+class ClElementwisePower : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported:F16/F32.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwisePower::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
new file mode 100644
index 0000000000..24a603e8c3
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
+
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT);
+ _kernel = std::move(k);
+}
+
+Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT);
+}
+
+void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::EXP);
+ _kernel = std::move(k);
+}
+
+Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP);
+}
+
+void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::NEG);
+ _kernel = std::move(k);
+}
+
+Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG);
+}
+
+void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::SIN);
+ _kernel = std::move(k);
+}
+
+Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN);
+}
+
+void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::ABS);
+ _kernel = std::move(k);
+}
+
+Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS);
+}
+
+void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::LOG);
+ _kernel = std::move(k);
+}
+
+Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG);
+}
+
+void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::ROUND);
+ _kernel = std::move(k);
+}
+
+Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.h b/src/gpu/cl/operators/ClElementwiseUnary.h
new file mode 100644
index 0000000000..a23b789ab5
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to perform inverse square root on an src tensor. */
+class ClRsqrt : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClRsqrt::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to perform exponential on an src tensor. */
+class ClExp : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClExp::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to negate an src tensor. */
+class ClNeg : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClNeg::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to calculate sine of an src tensor. */
+class ClSin : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClSin::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to perform elementwise log on an src tensor. */
+class ClLog : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClLog::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to get the absolute value of an src tensor. */
+class ClAbs : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClAbs::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to get the round (to the nearest even) value of an src tensor. */
+class ClRound : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClRound::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */
diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp
new file mode 100644
index 0000000000..9e006c1649
--- /dev/null
+++ b/src/gpu/cl/operators/ClFill.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFill.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
+{
+ auto k = std::make_unique<kernels::ClFillKernel>();
+ k->configure(compile_context, tensor, constant_value, dst_window);
+ _kernel = std::move(k);
+}
+
+Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
+{
+ return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h
new file mode 100644
index 0000000000..c9289b2b95
--- /dev/null
+++ b/src/gpu/cl/operators/ClFill.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FILL_H
+#define ARM_COMPUTE_CL_FILL_H
+
+#include "arm_compute/core/Window.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFillKernel */
+class ClFill : public IClOperator
+{
+public:
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Source tensor info. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClFill::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FILL_H */
diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp
new file mode 100644
index 0000000000..3283454fd6
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFlatten.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClReshapeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFlatten.h b/src/gpu/cl/operators/ClFlatten.h
new file mode 100644
index 0000000000..d2ce3b701d
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLATTEN_H
+#define ARM_COMPUTE_CL_FLATTEN_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to flatten a given input */
+class ClFlatten : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: All
+ * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where:
+ * w = width input tensor, h = height input tensor and d = depth input tensor.
+ * Data type supported: same as @p src
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClFlatten::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLATTEN_H */
diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp
new file mode 100644
index 0000000000..866bff2fad
--- /dev/null
+++ b/src/gpu/cl/operators/ClFloor.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFloor.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClFloorKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClFloorKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClFloorKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFloor.h b/src/gpu/cl/operators/ClFloor.h
new file mode 100644
index 0000000000..746147335e
--- /dev/null
+++ b/src/gpu/cl/operators/ClFloor.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLOOR_H
+#define ARM_COMPUTE_CL_FLOOR_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFloorKernel */
+class ClFloor : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] dst Destination tensor info. Data type supported: same as @p src
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClFloor::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLOOR_H */
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
new file mode 100644
index 0000000000..8b7e336c9f
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFullyConnected.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+#include "src/gpu/cl/operators/ClFlatten.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/operators/ClTranspose.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
+{
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.gemmlowp_multiplier = 0;
+ gemmlowp_output_stage.gemmlowp_shift = 0;
+
+ const auto data_type = src.data_type();
+
+ // Configure output stage for quantized case
+ if(is_data_type_quantized_asymmetric(data_type))
+ {
+ const QuantizationInfo oq_info = dst.quantization_info();
+ const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
+ const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
+ const UniformQuantizationInfo oq_unif = oq_info.uniform();
+
+ const auto output_quant_info = (dst.total_size() == 0) ? iq_unif : oq_unif;
+
+ const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+
+ if(activation_info.enabled())
+ {
+ std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+ gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
+ type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
+ type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
+ }
+
+ return Status{};
+}
+
+Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info)
+{
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
+ true, // broadcast_bias
+ ActivationLayerInfo()); // activation_info
+
+ if(is_data_type_quantized_asymmetric(src.data_type()))
+ {
+ const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate src and weights offset
+ const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset);
+ const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info),
+ bias,
+ &dst,
+ gemm_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info));
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClFullyConnected::ClFullyConnected()
+ : _convert_weights(nullptr),
+ _flatten(nullptr),
+ _reshape_weights(nullptr),
+ _mm_gemm(nullptr),
+ _mm_gemmlowp(nullptr),
+ _aux_mem(Count)
+{
+}
+
+ClFullyConnected::~ClFullyConnected() = default;
+
+void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info);
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
+ true, // broadcast_bias
+ fc_info.activation_info, // activation_info
+ fc_info.constant_weights); // constant_weights
+
+ if(_is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo src_quantization_info = src->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+ TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
+ TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+
+ src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
+ weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm = std::make_unique<ClGemm>();
+ _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info);
+ }
+}
+
+void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for flatten
+ _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW);
+
+ // Configure flatten kernel
+ _flatten = std::make_unique<ClFlatten>();
+ _flatten->configure(compile_context, src, &_flattened_src);
+
+ // Configure matrix multiply kernel
+ configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
+}
+
+void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(compile_context, src, weights, bias, dst, fc_info);
+}
+
+void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
+
+ _are_weights_converted = true;
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _is_fc_after_conv = true;
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _is_prepared = fc_info.retain_internal_weights;
+ _weights_to_use = TensorInfo(*weights);
+ _weights_to_use_idx = ACL_SRC_1;
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+ if(is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
+ src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ ITensorInfo *weights_used = weights;
+
+ // Reshape weights if needed
+ if(!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights = std::make_unique<ClTranspose>();
+ _reshape_weights->configure(compile_context, weights, &_reshaped_weights);
+ weights_used = &_reshaped_weights;
+ _weights_to_use_idx = offset_int_vec(TransposedWeights);
+ }
+
+ // Convert weights if needed
+ if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
+ _convert_weights->configure(compile_context,
+ weights_used,
+ &_converted_weights,
+ src->tensor_shape(),
+ fc_info.weights_trained_layout);
+
+ weights_used = &_converted_weights;
+ _weights_to_use_idx = offset_int_vec(ConvertedWeights);
+ _are_weights_converted = false;
+ }
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info);
+ }
+ // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion)
+ _weights_to_use = *weights_used;
+
+ // Set auxiliary memory requirements
+ auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
+ for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ {
+ _aux_mem[i] = gemm_mem_req[i];
+ }
+ if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
+ {
+ // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
+ }
+ else
+ {
+ // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
+ const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
+ const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
+
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size());
+ }
+ _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+}
+
+Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
+ && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+
+ const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW));
+ const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *src_to_use = src;
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+ if(is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
+ src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ if(!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use,
+ &converted_weights,
+ src->tensor_shape(),
+ fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if(is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
+ src_to_use = &flatten_src;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
+ }
+
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info));
+
+ return Status{};
+}
+
+void ClFullyConnected::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+
+ CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+ CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
+
+ // Linearize input if it comes from a convolutional layer
+ if(_is_fc_after_conv)
+ {
+ ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+ _flatten->run(flatten_pack);
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+ if(_weights_to_use_idx != ACL_SRC_1)
+ {
+ gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
+ }
+
+ // Run matrix multiply
+ if(_is_quantized)
+ {
+ _mm_gemmlowp->run(gemm_pack);
+ }
+ else
+ {
+ _mm_gemm->run(gemm_pack);
+ }
+}
+
+void ClFullyConnected::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(ACL_SRC_1);
+
+ CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
+ CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
+
+ // Pointer to current weights
+ const ITensor *cur_weights = weights;
+
+ // Reshape of the weights if needed (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
+ _reshape_weights->run(transpose_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = reshaped_weights.get();
+
+ _are_weights_reshaped = true;
+ }
+
+ // Convert weights if needed (happens only once)
+ if(!_are_weights_converted)
+ {
+ ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+ _convert_weights->run(convert_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = converted_weights.get();
+
+ _are_weights_converted = true;
+ }
+
+ tensors.add_const_tensor(ACL_SRC_1, cur_weights);
+
+ // Prepare GEMM prepare and release unused weights
+ if(!_is_quantized)
+ {
+ _mm_gemm->prepare(tensors);
+ }
+ else
+ {
+ _mm_gemmlowp->prepare(tensors);
+ }
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClFullyConnected::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
new file mode 100644
index 0000000000..dc5f9e5c9b
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H
+#define ARM_COMPUTE_CL_FULLY_CONNECTED_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+// Forward declarations
+class ClConvertFullyConnectedWeights;
+class ClFlatten;
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+class ClTranspose;
+
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref CLTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
+ * -# @ref opencl::kernels::ClGemmMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class ClFullyConnected : public IClOperator
+{
+public:
+ ClFullyConnected();
+ ~ClFullyConnected();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p src.
+ * @param[out] dst Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+ * Data type supported: Same as @p src.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClFullyConnected::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods overriden
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
+ void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
+ void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
+
+private:
+ enum AuxTensorIdx
+ {
+ TransposedWeights = 10,
+ ConvertedWeights = 11,
+ FlattenedSrc = 12,
+ Count = 13
+ };
+
+ std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights;
+ std::unique_ptr<ClFlatten> _flatten;
+ std::unique_ptr<ClTranspose> _reshape_weights;
+ std::unique_ptr<ClGemm> _mm_gemm;
+ std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+
+ experimental::MemoryRequirements _aux_mem{};
+
+ TensorInfo _flattened_src{};
+ TensorInfo _converted_weights{};
+ TensorInfo _reshaped_weights{};
+
+ TensorInfo _weights_to_use{};
+ int _weights_to_use_idx{ ACL_SRC_1 };
+
+ bool _are_weights_converted{ true };
+ bool _are_weights_reshaped{ true };
+ bool _is_fc_after_conv{ true };
+ bool _is_quantized{ false };
+ bool _is_prepared{ false };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
new file mode 100644
index 0000000000..625c057cf4
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -0,0 +1,771 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemm.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
+
+#include "support/Cast.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+using namespace arm_compute::opencl::kernels;
+
+namespace
+{
+inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
+{
+ switch(kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE_V1:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_V1:
+ case CLGEMMKernelType::RESHAPED:
+ {
+ return true;
+ }
+ default:
+ {
+ return false;
+ }
+ }
+}
+//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
+inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
+{
+ if(!constant_weights)
+ {
+ return CLGEMMKernelType::NATIVE_V1;
+ }
+
+ auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
+ if(bool(gemm_kernel))
+ {
+ if(validate_gemm_kernel(gemm_kernel.gemm_type))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+ }
+ }
+ gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+}
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
+ const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+ TensorInfo tmp_b_info{};
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.has_pad_y = false;
+ if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+ {
+ return false;
+ }
+ gemm_kernel_info.has_pad_y = true;
+ if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c, const ITensorInfo *output)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
+ if(config)
+ {
+ if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+}
+
+// Validate lhs_info and rhs_info for reshaped kernel
+inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
+ const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Validate reshape LHS kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
+ if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
+ {
+ return false;
+ }
+
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,
+ const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
+ if(config)
+ {
+ if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_reshaped(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+}
+} // namespace
+
+ClGemm::ClGemm()
+ : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()),
+ _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
+ _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+ _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
+ _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
+ _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
+ _tmp_a(),
+ _tmp_b(),
+ _reshape_b_only_on_first_run(false),
+ _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
+{
+}
+
+void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
+{
+ const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _mm_kernel->set_target(gpu_target);
+
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
+
+ // Configure and tune matrix multiply kernel
+ _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+
+ // Tune kernel statically
+ CLScheduler::get().tune_kernel_static(*_mm_kernel);
+}
+
+void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
+{
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ // Set the target for the kernels
+ _reshape_lhs_kernel->set_target(gpu_target);
+ _mm_kernel->set_target(gpu_target);
+
+ if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
+
+ GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
+
+ // Configure interleave kernel
+ _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
+
+ // Configure transpose kernel
+ _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+
+ CLScheduler::get().tune_kernel_static(*_mm_kernel);
+
+ // Request memory for LHS and RHS reshape matrix
+ _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
+{
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ // Set the target for the kernels
+ _reshape_lhs_kernel->set_target(gpu_target);
+ _mm_kernel->set_target(gpu_target);
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,
+ c, output, gemm_info.reinterpret_input_as_3d());
+
+ _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+ // Request memory for LHS and RHS reshape matrix
+ _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
+{
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ // Set the target for the kernels
+ _mm_kernel->set_target(gpu_target);
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);
+
+ // Transpose matrix
+ _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+ // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)
+ // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have
+ // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false
+
+ // Configure matrix multiply kernel with no y padding support
+ kernel_info.has_pad_y = false;
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+ // Configure matrix multiply kernel with y padding support
+ kernel_info.has_pad_y = true;
+ _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+ // Request memory for RHS reshape matrix
+ _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
+ false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
+
+ return Status{};
+}
+
+Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+ if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
+ true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
+
+ return Status{};
+}
+
+Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+ return Status{};
+}
+
+Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ kernel_info.has_pad_y = false;
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+ kernel_info.has_pad_y = true;
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+ return Status{};
+}
+
+void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));
+
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = gemm_info.retain_internal_weights();
+
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+
+ // Select GEMMType
+ _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,
+ gemm_info.constant_weights());
+
+ const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+ ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
+ switch(_gemm_kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE_V1:
+ {
+ configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_V1:
+ {
+ configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED:
+ {
+ configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
+ }
+}
+
+Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ // Get the GPU target
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+
+ // Select GEMMType
+ CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
+ {
+ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
+ },
+ gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights());
+
+ const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+ const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
+ switch(gemm_kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE_V1:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_V1:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+ }
+ }
+
+ return Status{};
+}
+
+void ClGemm::run(ITensorPack &tensors)
+{
+ const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
+ const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
+ const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);
+
+ CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
+ CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+
+ // Prepare the consts if needed
+ prepare(tensors);
+
+ // Run matrix multiply kernel
+ switch(_gemm_kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE_V1:
+ {
+ CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_V1:
+ case CLGEMMKernelType::RESHAPED:
+ {
+ // Run interleave kernel
+ ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
+ CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+ }
+
+ ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
+
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
+ }
+ else
+ {
+ CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true);
+ }
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+ }
+ // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
+ // Check if the lhs or dst tensors have padding
+ const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
+ const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
+ bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+
+ ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
+ if(has_pad_y)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true);
+ }
+ else
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true);
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
+ }
+}
+
+void ClGemm::prepare(ITensorPack &constants)
+{
+ if(!_is_prepared)
+ {
+ const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
+ ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
+
+ // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
+ if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
+
+ CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
+ ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
+
+ ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
+ }
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClGemm::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
new file mode 100644
index 0000000000..60bb78c371
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_H
+#define ARM_COMPUTE_CL_GEMM_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if the RESHAPED_V1 is selected by the heuristic model)
+ * -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if either the RESHAPED_V1 or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyKernel (only if either the NATIVE or RESHAPED_V1 is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED_V1 is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
+ */
+class ClGemm : public IClOperator
+{
+public:
+ /** Constructor */
+ ClGemm();
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ *
+ * @note All tensors must have the same data type.
+ *
+ * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
+ * @param[out] output Output tensor. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+ * in case matrix A and matrix B have been already transformed.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemm::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ void configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ void configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ void configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+ static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+private:
+ enum AuxTensorIdx
+ {
+ LhsReshape = 0,
+ RhsReshape,
+ Count
+ };
+
+private:
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyKernel> _mm_kernel;
+ std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel> _reshape_lhs_kernel;
+ std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _reshape_rhs_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel;
+ TensorInfo _tmp_a;
+ TensorInfo _tmp_b;
+ bool _reshape_b_only_on_first_run;
+ CLGEMMKernelType _gemm_kernel_type;
+ bool _is_prepared;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMM_H */
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
new file mode 100644
index 0000000000..0f625bc56a
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+using namespace experimental;
+using namespace misc::shape_calculator;
+using namespace utils::cast;
+namespace opencl
+{
+ClGemmConv2d::ClGemmConv2d()
+ : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
+ _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+{
+}
+ClGemmConv2d::~ClGemmConv2d() = default;
+
+void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ gemm_3d_depth, // depth_output_gemm3d
+ _skip_im2col, // reinterpret_input_as_3d
+ false, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ false, // fast_math
+ false, // fp_mixed_precision
+ true, // broadcast_bias
+ act_info); // activation_info
+
+ TensorInfo tmp_src{ *src };
+ if(_is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info = src->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+ tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
+
+ // Revert back QuantizatioInfo as weights could be used in other convolution layers
+ weights->set_quantization_info(weights_quantization_info);
+
+ auto mm_mem_req = _mm_gemmlowp->workspace();
+ for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+ else
+ {
+ // Configure matrix multiply function
+ _mm_gemm = std::make_unique<ClGemm>();
+ _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+ auto mm_mem_req = _mm_gemm->workspace();
+ for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+}
+
+Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
+{
+ const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ gemm_3d_depth, // depth_output_gemm3d
+ skip_im2col, // reinterpret_input_as_3d
+ false, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ false, // fast_math
+ false, // fp_mixed_precision
+ true, // broadcast_bias
+ act_info); // activation_info
+
+ if(is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info = src->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+ std::unique_ptr<ITensorInfo> src_qa = src->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ // Perform validation step on GEMMLowp
+ return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
+ }
+ else
+ {
+ // Perform validation step on Matrix multiply function
+ return ClGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+ }
+}
+
+void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+ ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst,
+ conv2d_info,
+ weights_info));
+
+ const DataType data_type = src->data_type();
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+ const unsigned int num_kernels = weights->dimension(idx_kernels);
+
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ _is_prepared = weights_info.retain_internal_weights();
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+ _skip_col2im = data_layout == DataLayout::NHWC;
+
+ // Only for quantize there are few cases where we cannot fuse the activation function in GEMM
+ _fuse_activation = true;
+
+ const ITensorInfo *gemm_input_to_use = src;
+ ITensorInfo *gemm_output_to_use = dst;
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv2d_info.conv_info,
+ conv2d_info.dilation);
+
+ unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+ ITensorInfo *biases_to_use = biases;
+ _append_bias = false;
+
+ _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
+ if(conv2d_info.num_groups != 1 && biases != nullptr)
+ {
+ // num_groups != 1 can only be for NCHW
+ // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor
+ biases_to_use = nullptr;
+ _append_bias = true;
+ _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups);
+ }
+ else
+ {
+ _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups);
+ }
+
+ // Create tensor to store im2col reshaped inputs
+ if(!_skip_im2col)
+ {
+ // Configure and tune im2col. im2col output shape is auto-initialized
+ _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>();
+
+ // Set the GPU target for im2col
+ _im2col_kernel->set_target(CLScheduler::get().target());
+ _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
+
+ // Set quantization info
+ _im2col_output.set_quantization_info(src->quantization_info());
+ CLScheduler::get().tune_kernel_static(*_im2col_kernel);
+
+ // Update GEMM input
+ gemm_input_to_use = &_im2col_output;
+ }
+
+ // Create GEMM output tensor
+ if(!_skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ // If we cannot skip col2im it means we run im2col as well
+ shape_gemm = _im2col_output.tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ _gemm_output = TensorInfo(shape_gemm, 1, data_type);
+ _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output;
+ }
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+
+ // Configure output stage for quantized case
+ if(_is_quantized)
+ {
+ const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+ const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
+
+ gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+ gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+ gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
+ gemmlowp_output_stage.gemmlowp_multipliers.data(),
+ gemmlowp_output_stage.gemmlowp_shifts.data());
+ gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+ gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
+
+ PixelValue min_val{};
+ PixelValue max_val{};
+ std::tie(min_val, max_val) = get_min_max(dst->data_type());
+
+ auto min_activation = min_val.get<int32_t>();
+ auto max_activation = max_val.get<int32_t>();
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(conv2d_info.act_info.enabled())
+ {
+ if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+ }
+ else
+ {
+ _fuse_activation = false;
+ }
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+ }
+
+ // Configure and tune GEMM
+ // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+ configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
+
+ if(!_skip_col2im)
+ {
+ // Set the GPU target for col2im
+ _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
+ _col2im_kernel->set_target(CLScheduler::get().target());
+ // Configure and tune Col2Im
+ _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups);
+ CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+ "Output shape does not match the expected one");
+
+ if(!_fuse_activation)
+ {
+ _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
+ _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
+ }
+
+ _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+}
+
+Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+
+ if(!is_quantized_per_channel)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW));
+
+ const DataLayout data_layout = src->data_layout();
+ const DataType data_type = src->data_type();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+ const unsigned int num_kernels = weights->dimension(idx_kernels);
+
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = src;
+ const ITensorInfo *gemm_output_to_use = dst;
+ const ITensorInfo *weights_to_use = weights;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
+ && conv2d_info.conv_info.stride().second == 1);
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
+
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ if(conv2d_info.act_info.enabled())
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
+ }
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv2d_info.conv_info,
+ conv2d_info.dilation);
+
+ unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+ const ITensorInfo *biases_to_use = biases;
+ bool append_bias = false;
+
+ if(conv2d_info.num_groups != 1 && biases != nullptr)
+ {
+ // num_groups != 1 can only be for NCHW
+ // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor
+ biases_to_use = nullptr;
+ append_bias = true;
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
+ }
+ else
+ {
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
+ }
+
+ weights_to_use = &weights_reshaped_info;
+
+ if(!skip_im2col)
+ {
+ const Size2D kernel_dims(kernel_width, kernel_height);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups);
+
+ auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups));
+ gemm_input_to_use = &im2col_reshaped_info;
+ }
+
+ // Create GEMM output tensor
+ if(!skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ shape_gemm = gemm_input_to_use->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ info_gemm = TensorInfo(shape_gemm, 1, data_type);
+ info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ gemm_output_to_use = &info_gemm;
+ }
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+ if(is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+ const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
+ const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
+
+ gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+ gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
+ gemmlowp_output_stage.gemmlowp_multipliers.data(),
+ gemmlowp_output_stage.gemmlowp_shifts.data());
+ gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+ gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
+
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(conv2d_info.act_info.enabled())
+ {
+ if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+ }
+ else
+ {
+ fuse_activation = false;
+ }
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+ }
+
+ // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
+
+ // Validate Col2Im
+ if(!skip_col2im)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
+ }
+
+ //Validate Activation Layer
+ if(!fuse_activation)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
+ }
+
+ return Status{};
+}
+
+void ClGemmConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto biases = tensors.get_const_tensor(ACL_SRC_2);
+ auto dst = tensors.get_tensor(ACL_DST);
+ auto gemm_input_to_use = src;
+ auto gemm_output_to_use = dst;
+
+ CLAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+ CLAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+ CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
+
+ // Run im2col
+ if(!_skip_im2col)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, src },
+ { TensorType::ACL_DST, im2col_output.get() }
+ };
+ CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false);
+ gemm_input_to_use = im2col_output.get();
+ }
+ if(!_skip_col2im)
+ {
+ gemm_output_to_use = gemm_output.get();
+ }
+ ITensorPack pack_mm = tensors;
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+ if(!_append_bias)
+ {
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases);
+ }
+ pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
+ // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions
+ if(_is_quantized)
+ {
+ // Run gemmlowp
+ _mm_gemmlowp->run(pack_mm);
+ }
+ else
+ {
+ // Run gemm
+ _mm_gemm->run(pack_mm);
+ }
+
+ // Reshape output matrix
+ if(!_skip_col2im)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output_to_use },
+ { TensorType::ACL_DST, dst }
+ };
+ CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false);
+ }
+
+ //Run Activation Layer if we cannot fuse in GEMM
+ if(!_fuse_activation)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, dst },
+ { TensorType::ACL_DST, dst }
+ };
+ CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false);
+ }
+}
+
+void ClGemmConv2d::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ // Run weights reshaping and mark original weights tensor as unused
+ ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+ CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, weights },
+ { TensorType::ACL_DST, weights_reshaped.get() }
+ };
+
+ if(_append_bias)
+ {
+ const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ pack.add_const_tensor(TensorType::ACL_BIAS, biases);
+ }
+ CLScheduler::get().enqueue_op(*_weights_reshape_kernel.get(), pack, true);
+ tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+
+ // Prepare GEMM
+ _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements ClGemmConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
new file mode 100644
index 0000000000..9a5e381dd7
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H
+#define ARM_COMPUTE_CL_GEMM_CONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+namespace kernels
+{
+class ClIm2ColKernel;
+class ClCol2ImKernel;
+class ClWeightsReshapeKernel;
+class ClActivationKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::kernels::ClIm2ColKernel
+ * -# @ref ClGemm (if the data type is FP32 or FP16)
+ * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref ClGemmLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref opencl::kernels::ClCol2ImKernel (if NCHW data layout)
+ * -# @ref opencl::kernels::ClActivationKernel
+ */
+class ClGemmConv2d : public IClOperator
+{
+public:
+ /** Constructor */
+ ClGemmConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClGemmConv2d(const ClGemmConv2d &) = delete;
+ /** Default move constructor */
+ ClGemmConv2d(ClGemmConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClGemmConv2d &operator=(const ClGemmConv2d &) = delete;
+ /** Default move assignment operator */
+ ClGemmConv2d &operator=(ClGemmConv2d &&) = default;
+ /**Default destructor */
+ ~ClGemmConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemmConvolution::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ /** Configures the appropriate matrix multiply routine
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+ * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+ * @param[in, out] dst Output tensor info. Data types supported: same as @p input.
+ * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] act_info Activation to apply after the matrix multiplication
+ */
+ void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth, const ActivationLayerInfo &act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+ * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+ * @param[in] dst Output tensor info. Data types supported: same as @p input.
+ * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
+ * @param[in] act_info Activation to apply after the matrix multiplication
+ *
+ * @return a status
+ */
+ static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
+
+ enum AuxTensorIdx
+ {
+ // ClGemmLowpMatrixMultiplyCore has up to 7 internal tensors
+ Im2ColOutput = 8,
+ WeightsReshaped,
+ GemmOutput,
+ Count
+ };
+
+ std::unique_ptr<kernels::ClWeightsReshapeKernel> _weights_reshape_kernel;
+ std::unique_ptr<kernels::ClIm2ColKernel> _im2col_kernel;
+ std::unique_ptr<ClGemm> _mm_gemm;
+ std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+ std::unique_ptr<opencl::kernels::ClCol2ImKernel> _col2im_kernel;
+ std::unique_ptr<kernels::ClActivationKernel> _activation_kernel;
+
+ TensorInfo _im2col_output;
+ TensorInfo _weights_reshaped;
+ TensorInfo _gemm_output;
+
+ bool _skip_im2col;
+ bool _skip_col2im;
+ bool _is_quantized;
+ bool _fuse_activation;
+ bool _append_bias;
+ bool _is_prepared;
+
+ experimental::MemoryRequirements _aux_mem;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..f3c0ee1c8f
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
+
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+using namespace arm_compute::opencl::kernels;
+using namespace arm_compute::experimental;
+
+namespace
+{
+inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
+{
+ switch(kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ return true;
+ }
+ default:
+ {
+ return false;
+ }
+ }
+}
+
+//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
+inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
+{
+ auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
+ if(bool(gemm_kernel))
+ {
+ if(validate_gemm_kernel(gemm_kernel.gemm_type))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+ }
+ }
+ gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+}
+
+// Validate lhs_info and rhs_info for native kernel
+inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+ TensorInfo mm_result_s32_info{};
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
+ // Validate mm kernel
+ // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info
+ // NOTE: This assumes:
+ // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
+ // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
+ if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
+ if(config)
+ {
+ if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_native(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+}
+
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
+ unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+ TensorInfo tmp_b_info{};
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+ // NOTE: This assumes:
+ // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
+ // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ // Since we ignore the output stage, output data type has to be S32 to pass the validation
+ TensorInfo output_info_copy(*output);
+ output_info_copy.set_data_type(DataType::S32);
+ if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
+ const ITensorInfo *a,
+ const ITensorInfo *b, const ITensorInfo *output)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
+ if(config)
+ {
+ if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+}
+
+inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
+{
+ switch(kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ return false;
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ return true;
+ default:
+ ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
+ }
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
+ : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
+ _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
+ _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+ _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+ _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
+ _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
+ _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
+ _aux_mem(AuxTensorIdx::Count)
+{
+}
+
+ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;
+
+void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+ ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info));
+
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _a_offset = a->quantization_info().uniform().offset;
+ _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
+ && a->data_type() == DataType::QASYMM8;
+ _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
+ _gemm_info = gemm_info;
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _mm_native_kernel->set_target(gpu_target);
+ _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+ const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ // Check if we need to reshape the matrix A and matrix B
+ _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));
+
+ if(_convert_to_qasymm8)
+ {
+ // Set data type for converted weights
+ _qasymm8_weights = *b;
+ _qasymm8_weights.set_data_type(DataType::QASYMM8);
+ _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
+ }
+
+ ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
+ if(_is_gemm_reshaped)
+ {
+ matrix_b = &_tmp_b;
+
+ // Pick up the GEMM configuration
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
+ depth_output_gemm3d,
+ a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+ // Configure reshape RHS kernel
+ _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+ }
+
+ // Using default reduction info
+ const GEMMLowpReductionKernelInfo reduction_info {};
+
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
+ }
+
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.a_offset = _a_offset;
+ gemm_kernel_info.b_offset = _b_offset;
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ // Configure offset contribution kernel
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+ _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
+ _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+ gemmlowp_output_stage.output_data_type = a->data_type();
+ if(num_filters == 1)
+ {
+ // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
+ // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
+ gemmlowp_output_stage.is_quantized_per_channel = false;
+ }
+
+ gemm_kernel_info.output_stage = gemmlowp_output_stage;
+
+ if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // Configure and tune matrix multiply kernel with fused output stage
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ }
+ else
+ {
+ _run_output_stage = true;
+
+ if(_is_gemm_reshaped)
+ {
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
+ a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
+
+ _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
+ c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
+ &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ }
+ }
+ }
+ else
+ {
+ _run_offset_contribution = true;
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
+ a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
+ }
+
+ // Configure offset contribution kernel
+ _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
+ c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset);
+ }
+
+ // Request memory
+ _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
+ if(_is_gemm_reshaped)
+ {
+ // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
+ _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ }
+ if(_a_offset != 0)
+ {
+ _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
+ }
+ if(_b_offset != 0)
+ {
+ _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ }
+ _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
+ _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
+}
+
+Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ int32_t a_offset = a->quantization_info().uniform().offset;
+ int32_t b_offset = b->quantization_info().uniform().offset;
+
+ const ITensorInfo *matrix_a_info = a;
+
+ TensorInfo tmp_b_info{};
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+ bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
+ && is_data_type_quantized_asymmetric(a->data_type());
+ TensorInfo weights_info(*b);
+ if(convert_to_qasymm8)
+ {
+ b_offset = -128;
+ weights_info.set_data_type(DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
+ }
+ const ITensorInfo *matrix_b_info = &weights_info;
+ if(reshape_matrix_b)
+ {
+ matrix_b_info = &tmp_b_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
+
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
+ }
+
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ const GEMMLowpReductionKernelInfo reduction_info;
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if(a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
+ }
+
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.a_offset = a_offset;
+ gemm_kernel_info.b_offset = b_offset;
+ if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+ const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+ gemmlowp_output_stage.output_data_type = a->data_type();
+
+ gemm_kernel_info.output_stage = gemmlowp_output_stage;
+ if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
+ }
+ else
+ {
+ TensorInfo mm_result_s32_info{};
+
+ if(reshape_matrix_b)
+ {
+ // Output tensor auto inizialitation if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+ }
+ else
+ {
+ // Output tensor auto inizialitation if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ output,
+ a_offset, b_offset,
+ gemmlowp_output_stage,
+ &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
+ }
+ }
+ else
+ {
+ if(reshape_matrix_b)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
+
+ if(output->total_size() != 0)
+ {
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ a_offset, b_offset));
+ }
+ }
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
+{
+ const ITensor *a = tensors.get_const_tensor(ACL_SRC_0);
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ const ITensor *c = tensors.get_const_tensor(ACL_SRC_2);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst);
+
+ CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
+ CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
+ CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
+ CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+ CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
+ CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
+ CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);
+
+ // Prepare the consts if needed
+ prepare(tensors);
+
+ const ITensor *matrix_a = a;
+ const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
+
+ if(_is_gemm_reshaped)
+ {
+ matrix_b = tmp_b.get();
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run reshape matrix B
+ ITensorPack mtx_b_reshape_pack =
+ {
+ { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
+ { TensorType::ACL_DST, tmp_b.get() }
+ };
+ CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
+ }
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ ITensorPack mtx_b_red_pack =
+ {
+ { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
+ { TensorType::ACL_DST, vec_sum_col.get() }
+ };
+ CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
+ }
+
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ ITensorPack mtx_a_red_pack =
+ {
+ { TensorType::ACL_SRC, matrix_a },
+ { TensorType::ACL_DST, vec_sum_row.get() }
+ };
+ CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
+ }
+
+ // Run matrix multiply
+ if(_is_gemm_reshaped)
+ {
+ ITensorPack gemm_reshaped_pack;
+ if(_run_offset_contribution)
+ {
+ gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
+ { TensorType::ACL_SRC_1, matrix_b },
+ { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
+ });
+ }
+ else
+ {
+ gemm_reshaped_pack = ITensorPack(
+ {
+ { TensorType::ACL_SRC, matrix_a },
+ { TensorType::ACL_SRC_1, matrix_b },
+ { TensorType::ACL_BIAS, c },
+ { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
+ { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
+ { TensorType::ACL_SHIFTS, shifts.get() },
+ { TensorType::ACL_MULTIPLIERS, multipliers.get() },
+ { TensorType::ACL_DST, dst },
+ });
+ }
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+ }
+ else
+ {
+ ITensorPack gemm_native_pack =
+ {
+ { TensorType::ACL_SRC_0, matrix_a },
+ { TensorType::ACL_SRC_1, matrix_b },
+ { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
+ };
+ CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
+ }
+ if(_run_output_stage)
+ {
+ // Run offset contribution/output stage kernel
+ ITensorPack output_stage_pack =
+ {
+ { TensorType::ACL_SRC, res32.get() },
+ { TensorType::ACL_BIAS, c },
+ { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
+ { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
+ { TensorType::ACL_SHIFTS, shifts.get() },
+ { TensorType::ACL_MULTIPLIERS, multipliers.get() },
+ { TensorType::ACL_DST, dst },
+ };
+ CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
+ }
+ if(_run_offset_contribution)
+ {
+ // Run offset contribution kernel
+ ITensorPack offset_contrib_pack =
+ {
+ { TensorType::ACL_SRC_DST, dst },
+ { TensorType::ACL_BIAS, c },
+ { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
+ { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
+ };
+ CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
+ }
+}
+
+void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+ CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
+ CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(b);
+
+ if(_convert_to_qasymm8)
+ {
+ ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
+ CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
+ b->mark_as_unused();
+ }
+
+ if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
+ {
+ // Run reshape kernel and mark original weights tensor as unused
+ ITensorPack mtx_b_pack =
+ {
+ { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
+ { TensorType::ACL_DST, tmp_b.get() }
+ };
+ CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
+ b->mark_as_unused();
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ ITensorPack mtx_b_red_pack =
+ {
+ { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
+ { TensorType::ACL_DST, vec_sum_col.get() }
+ };
+ CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
+ }
+
+ // Compute GEMM output multipliers and shifts for output stage
+ {
+ const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+ CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
+ CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);
+
+ ICLTensor *multiplier_tensor = multipliers.get();
+ if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
+ {
+ multiplier_tensor->map(CLScheduler::get().queue(), true);
+ std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
+ multiplier_tensor->unmap(CLScheduler::get().queue());
+ }
+
+ ICLTensor *shifts_tensor = shifts.get();
+ if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
+ {
+ shifts_tensor->map(CLScheduler::get().queue(), true);
+ std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+ shifts_tensor->unmap(CLScheduler::get().queue());
+ }
+ }
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..1965e3f97b
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+// Forward declarations
+class ClCastKernel;
+class ClGemmLowpMatrixMultiplyNativeKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
+class ClGemmReshapeRhsMatrixKernel;
+class ClGemmLowpMatrixAReductionKernel;
+class ClGemmLowpMatrixBReductionKernel;
+class ClGemmLowpOffsetContributionKernel;
+class ClGemmLowpOffsetContributionOutputStageKernel;
+} // namespace kernels
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
+class ClGemmLowpMatrixMultiplyCore : public IClOperator
+{
+public:
+ ClGemmLowpMatrixMultiplyCore();
+ ~ClGemmLowpMatrixMultiplyCore();
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ *
+ * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
+ * -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
+ * @param[out] output Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemmLowpMatrixMultiplyCore::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ ResultS32 = 0,
+ RhsQAsymm8,
+ RhsReshape,
+ VecSumCol,
+ VecSumRow,
+ Multipliers,
+ Shifts,
+ Count
+ };
+
+private:
+ // Kernels used
+ std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8;
+ std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
+ std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
+ std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
+ std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+
+ // Temporary tensors
+ TensorInfo _qasymm8_weights{};
+ TensorInfo _vector_sum_col{};
+ TensorInfo _vector_sum_row{};
+ TensorInfo _tmp_b{};
+ TensorInfo _mm_result_s32{};
+ TensorInfo _gemm_output_stage_multipliers{};
+ TensorInfo _gemm_output_stage_shifts{};
+
+ int32_t _a_offset{ 0 };
+ int32_t _b_offset{ 0 };
+ bool _is_gemm_reshaped{ true };
+ bool _reshape_b_only_on_first_run{ false };
+ bool _run_output_stage{ false };
+ bool _convert_to_qasymm8{ false };
+ bool _run_offset_contribution{ false };
+ bool _is_prepared{ false };
+ GEMMInfo _gemm_info{};
+
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
new file mode 100644
index 0000000000..27fb89217c
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ switch(info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>();
+ k->configure(compile_context, src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(compile_context, src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ {
+ auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(compile_context, src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
+
+ switch(info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info);
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(src, bias, dst, &info);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+void ClGemmLowpOutputStage::run(ITensorPack &tensors)
+{
+ const ITensor *src = tensors.get_const_tensor(ACL_SRC);
+ const ITensor *bias = tensors.get_const_tensor(ACL_BIAS);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
+
+ ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } };
+ CLScheduler::get().enqueue_op(*_kernel, pack, true);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
new file mode 100644
index 0000000000..3f1b04dcce
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+/** This file contains all available output stages for GEMMLowp on OpenCL.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
+ *
+ * This function calls the following CL kernels:
+ *
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
+*/
+class ClGemmLowpOutputStage : public IClOperator
+{
+public:
+ /** Constructor */
+ ClGemmLowpOutputStage() = default;
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:-------------|
+ * |S32 |S32 |QASYMM8 |
+ * |S32 |S32 |QASYMM8_SIGNED|
+ * |S32 |S32 |QSYMM16 |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] info GEMMLowp output stage metadata.
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemmLowpOutputStage::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H */
diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp
new file mode 100644
index 0000000000..b909066e4c
--- /dev/null
+++ b/src/gpu/cl/operators/ClLogicalNot.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClLogicalNot.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT);
+ _kernel = std::move(k);
+}
+
+Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClLogicalNot.h b/src/gpu/cl/operators/ClLogicalNot.h
new file mode 100644
index 0000000000..31d4a99be6
--- /dev/null
+++ b/src/gpu/cl/operators/ClLogicalNot.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H
+#define ARM_COMPUTE_CL_LOGICAL_NOT_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */
+class ClLogicalNot : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: U8.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClLogicalNot::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */
diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp
new file mode 100644
index 0000000000..59d2b96bee
--- /dev/null
+++ b/src/gpu/cl/operators/ClMul.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClMul.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClMulKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClMulKernel>();
+ k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClComplexMulKernel>();
+ k->configure(compile_context, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h
new file mode 100644
index 0000000000..6a158c910d
--- /dev/null
+++ b/src/gpu/cl/operators/ClMul.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_MUL_H
+#define ARM_COMPUTE_CL_MUL_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClMulKernel */
+class ClMul : public IClOperator
+{
+public:
+ /** Initialise the kernel's sources, dst and convertion policy.
+ *
+ * Valid configurations (src1,src2) -> Output :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,U8) -> S16
+ * - (S16,S16) -> S16
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ * - (QSYMM16,QSYMM16) -> S32
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
+class ClComplexMul : public IClOperator
+{
+public:
+ /** Initialise the kernel's sources, dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClComplexMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_MUL_H */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp
new file mode 100644
index 0000000000..05717d5bb7
--- /dev/null
+++ b/src/gpu/cl/operators/ClPRelu.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPRelu.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using KernelType = kernels::ClArithmeticKernel;
+void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
+{
+ auto k = std::make_unique<KernelType>();
+ k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
+ _kernel = std::move(k);
+}
+
+Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+ return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
+}
+
+void ClPRelu::run(ITensorPack &tensors)
+{
+ // Output tensor can be given as nullptr for in-place computation.
+ // In this case, get the input tensor and use it as the output tensor.
+ if(tensors.get_tensor(TensorType::ACL_DST) == nullptr)
+ {
+ auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
+ tensors.add_tensor(TensorType::ACL_DST, src_tensor);
+ }
+ IClOperator::run(tensors);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h
new file mode 100644
index 0000000000..8084ab86cd
--- /dev/null
+++ b/src/gpu/cl/operators/ClPRelu.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PRELU_H
+#define ARM_COMPUTE_CL_PRELU_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
+ *
+ * @note The operator implements an activation layer with the PRELU activation function.
+ */
+class ClPRelu : public IClOperator
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPRelu::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PRELU_H */
diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp
new file mode 100644
index 0000000000..ed74e22b6c
--- /dev/null
+++ b/src/gpu/cl/operators/ClPermute.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPermute.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPermuteKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+{
+ auto k = std::make_unique<kernels::ClPermuteKernel>();
+ k->configure(compile_context, src, dst, perm);
+ _kernel = std::move(k);
+}
+
+Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ return kernels::ClPermuteKernel::validate(src, dst, perm);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h
new file mode 100644
index 0000000000..3e87329f9b
--- /dev/null
+++ b/src/gpu/cl/operators/ClPermute.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PERMUTE_H
+#define ARM_COMPUTE_CL_PERMUTE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClPermuteKernel */
+class ClPermute : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs and permute vector
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[in] dst The dst tensor info. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPermute::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PERMUTE_H */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
new file mode 100644
index 0000000000..fdadd199fc
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPool2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPool2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::ClPool2dKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, dst, info, indices);
+ _pooling = std::move(k);
+
+ const DataType data_type = src->data_type();
+
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+ BorderMode border_mode{};
+ PixelValue pixel_value(0.f);
+ if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
+ {
+ pixel_value = PixelValue(0, data_type, src->quantization_info());
+ }
+
+ // Data layout
+ const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ break;
+ case DataLayout::NHWC:
+ border_mode = BorderMode::CONSTANT;
+ if(PoolingType::MAX == info.pool_type)
+ {
+ if(is_data_type_quantized(data_type))
+ {
+ std::tie(pixel_value, std::ignore) = get_min_max(data_type);
+ }
+ else
+ {
+ pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+ }
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ auto b = std::make_unique<CLFillBorderKernel>();
+ b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
+ _border_handler = std::move(b);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_pooling);
+}
+
+Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+{
+ return kernels::ClPool2dKernel::validate(src, dst, info, indices);
+}
+
+void ClPool2d::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
+ CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
new file mode 100644
index 0000000000..a041053bb3
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref opencl::ClPool2d
+ */
+class ClPool2d : public IClOperator
+{
+public:
+ /** Constructor */
+ ClPool2d() = default;
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ * @param[in] info Pooling layer parameters.
+ * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPool2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<ICLKernel> _pooling{ nullptr };
+ std::unique_ptr<ICLKernel> _border_handler{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_H */
diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp
new file mode 100644
index 0000000000..915e0fdef0
--- /dev/null
+++ b/src/gpu/cl/operators/ClQuantize.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClQuantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClQuantizeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClQuantizeKernel::validate(src, dst);
+}
+
+void ClQuantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClQuantize.h b/src/gpu/cl/operators/ClQuantize.h
new file mode 100644
index 0000000000..3e50fcefb3
--- /dev/null
+++ b/src/gpu/cl/operators/ClQuantize.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_H
+#define ARM_COMPUTE_CL_QUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClQuantizeKernel that dequantizes an input tensor */
+class ClQuantize : public IClOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32.
+ * @param[out] dst Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+ *
+ * @note Output auto initialization is not supported by this function
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClQuantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
new file mode 100644
index 0000000000..2c1d1817d1
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClReshape.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClReshapeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClReshape.h b/src/gpu/cl/operators/ClReshape.h
new file mode 100644
index 0000000000..fee69a1c24
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_RESHAPE_H
+#define ARM_COMPUTE_CL_RESHAPE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClReshapeKernel */
+class ClReshape : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor info. Data type supported: All
+ * @param[out] output Output info. Data type supported: Same as @p input
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClReshape::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_RESHAPE_H */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp
new file mode 100644
index 0000000000..6dab66786a
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClScale.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClScaleKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ // Configure Scale kernel
+ auto k = std::make_unique<kernels::ClScaleKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, dst, info);
+ _kernel = std::move(k);
+
+ // Tune kernel
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ return kernels::ClScaleKernel::validate(src, dst, info);
+}
+
+void ClScale::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h
new file mode 100644
index 0000000000..af97cf23e7
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SCALE_H
+#define ARM_COMPUTE_CL_SCALE_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClScaleKernel
+ */
+class ClScale : public IClOperator
+{
+public:
+ /** Constructor */
+ ClScale() = default;
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClScale::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSCALE_H */
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
new file mode 100644
index 0000000000..6b728f5354
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClSoftmax.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+ClSoftmax::ClSoftmax()
+ : _permute_input(std::make_unique<ClPermute>()),
+ _permute_output(std::make_unique<ClPermute>()),
+ _max_shift_exp_sum_kernel(std::make_unique<kernels::ClLogits1DMaxShiftExpSumKernel>()),
+ _norm_kernel(std::make_unique<kernels::ClLogits1DNormKernel>()),
+ _max_info(),
+ _sum_info(),
+ _tmp_info(),
+ _permuted_src_info(),
+ _permuted_dst_info(),
+ _aux_mem(InternalTensorIdx::COUNT)
+{
+}
+
+void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
+
+ const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
+
+ _needs_permute = actual_axis != 0;
+
+ const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src;
+ ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst;
+
+ if(_needs_permute)
+ {
+ const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info);
+ }
+
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
+ _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
+
+ TensorShape max_sum_shape = tmp_input_info.tensor_shape();
+ _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
+ _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type);
+
+ // Set GPU target to kernels
+ _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target());
+
+ _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
+ _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
+
+ if(_needs_permute)
+ {
+ const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
+ }
+
+ _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
+ _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
+ _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
+
+ _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size());
+}
+
+Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_UNUSED(info.beta);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis);
+
+ const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
+ const bool needs_permute = actual_axis != 0;
+ if(needs_permute)
+ {
+ const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
+ TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
+ TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
+ }
+
+ // Create intermediate tensor info
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ? DataType::S32 : src.data_type();
+ TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+ TensorShape max_sum_shape = src.tensor_shape();
+ max_sum_shape.set(0, 1);
+ TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
+ TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info));
+
+ return Status{};
+}
+
+void ClSoftmax::run(ITensorPack &tensors)
+{
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false);
+ CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);
+ CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false);
+
+ CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false);
+ CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false);
+
+ if(_needs_permute)
+ {
+ ITensorPack pack;
+ pack.add_const_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, permuted_src.get());
+ _permute_input.get()->run(pack);
+ }
+
+ ITensorPack sum_pack;
+ ITensorPack norm_pack;
+ if(_needs_permute)
+ {
+ sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get());
+ norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get());
+ }
+ else
+ {
+ sum_pack.add_const_tensor(TensorType::ACL_SRC, src);
+ norm_pack.add_tensor(TensorType::ACL_DST, dst);
+ }
+ sum_pack.add_tensor(TensorType::ACL_DST, tmp.get());
+ sum_pack.add_tensor(TensorType::ACL_INT_0, max.get());
+ sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get());
+
+ norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get());
+ norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get());
+
+ CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false);
+ CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false);
+
+ if(_needs_permute)
+ {
+ ITensorPack pack;
+ pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get());
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output.get()->run(pack);
+ }
+}
+
+experimental::MemoryRequirements ClSoftmax::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h
new file mode 100644
index 0000000000..6c9af585d6
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SOFTMAX_H
+#define ARM_COMPUTE_CL_SOFTMAX_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+struct SoftmaxKernelInfo;
+
+namespace opencl
+{
+class ClPermute;
+namespace kernels
+{
+class ClLogits1DMaxShiftExpSumKernel;
+class ClLogits1DNormKernel;
+} // namespace kernels
+class ClSoftmax : public IClOperator
+{
+public:
+ /** Constructor */
+ ClSoftmax();
+ /** Configure the operator
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src
+ * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClSoftmax::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum InternalTensorIdx
+ {
+ MAX = 0,
+ SUM,
+ TMP,
+ PERMUTED_SRC,
+ PERMUTED_DST,
+ COUNT
+ };
+
+ std::unique_ptr<ClPermute> _permute_input;
+ std::unique_ptr<ClPermute> _permute_output;
+ std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
+ std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel;
+ bool _needs_permute{ false };
+
+ TensorInfo _max_info;
+ TensorInfo _sum_info;
+ TensorInfo _tmp_info;
+ TensorInfo _permuted_src_info;
+ TensorInfo _permuted_dst_info;
+
+ experimental::MemoryRequirements _aux_mem{};
+};
+
+} // opencl
+} // arm_compute
+#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp
new file mode 100644
index 0000000000..b94fef3cf9
--- /dev/null
+++ b/src/gpu/cl/operators/ClSub.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClSub.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+ ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
+ ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h
new file mode 100644
index 0000000000..902adbf39d
--- /dev/null
+++ b/src/gpu/cl/operators/ClSub.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SUB_H
+#define ARM_COMPUTE_CL_SUB_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic subtraction
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class ClSub : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * Valid configurations (src1,src2) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClSub::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SUB_H */
diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp
new file mode 100644
index 0000000000..6429451a42
--- /dev/null
+++ b/src/gpu/cl/operators/ClTranspose.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClTranspose.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClTransposeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClTransposeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClTranspose.h b/src/gpu/cl/operators/ClTranspose.h
new file mode 100644
index 0000000000..3642fc23f9
--- /dev/null
+++ b/src/gpu/cl/operators/ClTranspose.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSE_H
+#define ARM_COMPUTE_CL_TRANSPOSE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClTransposeKernel */
+class ClTranspose : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[in] dst The dst tensor info. Data types supported: Same as @p src
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClTranspose::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
new file mode 100644
index 0000000000..fbf6442a80
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
+{
+ Size2D output_tile = Size2D{};
+
+ const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
+
+ // Check if the input spatial dimensions are smaller than 4
+ const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+
+ if(kernel_max_dim == 3U)
+ {
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(3U, 1U))
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
+ }
+ else
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
+ }
+ }
+ else if(kernel_max_dim == 5U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
+ kernel_dims.height == 1 ? 1U : 4U);
+ }
+ else if(kernel_max_dim == 7U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
+ kernel_dims.height == 1 ? 1U : 2U);
+ }
+
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ // Get indeces for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ src->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+ const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
+
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+
+ // Configure output transform
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+ return Status{};
+}
+
+} // namespace
+
+ClWinogradConv2d::ClWinogradConv2d()
+ : _batched_mm(),
+ _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()),
+ _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()),
+ _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()),
+ _border_handler(),
+ _input0(),
+ _input1(),
+ _batched_mm_output(),
+ _is_prepared(false),
+ _aux_mem()
+{
+}
+
+ClWinogradConv2d::~ClWinogradConv2d() = default;
+
+void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ src->data_layout());
+
+ _is_prepared = false;
+
+ // Configure input transform
+ _input_transform->configure(compile_context, src, &_input0, winograd_info);
+ _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue());
+
+ // Configure filter transform
+ _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
+
+ // Configure batched matrix multiply
+ _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0,
+ false, false,
+ GEMMLowpOutputStageInfo(),
+ (src->data_type() == DataType::F16)));
+
+ // Configure output transform
+ _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
+
+ _aux_mem = _batched_mm.workspace();
+ const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
+ {
+ return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
+ }) ?
+ MemoryLifetime::Prepare :
+ MemoryLifetime::Persistent;
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
+}
+
+Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+ return Status{};
+}
+
+void ClWinogradConv2d::run(ITensorPack &tensors)
+{
+ const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare;
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
+ CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped);
+ CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
+
+ prepare(tensors);
+
+ // Run input transform
+ ITensorPack pack_it
+ {
+ { TensorType::ACL_SRC, src },
+ { TensorType::ACL_DST, input0.get() },
+ };
+ CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
+ CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
+
+ // Run batched matrix multiplication
+ ITensorPack pack_mm = tensors;
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
+ pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
+ is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+ _batched_mm.run(pack_mm);
+
+ // Run output transform
+ ITensorPack pack_ot
+ {
+ { TensorType::ACL_SRC_0, batched_mm_output.get() },
+ { TensorType::ACL_SRC_1, biases },
+ { TensorType::ACL_DST, dst },
+ };
+ CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
+}
+
+void ClWinogradConv2d::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
+
+ CLAuxTensorHandler input1(_input1, *in1_aux);
+ ITensorPack pack_ft
+ {
+ { TensorType::ACL_SRC, weights },
+ { TensorType::ACL_DST, input1.get() },
+ };
+ // Run filter transform and mark original weights as unused
+ CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
+ weights->mark_as_unused();
+
+ // Prepare GEMM and release reshaped weights if marked unused by ClGemm
+ ITensorPack mm_prepare_pack = tensors;
+ mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get());
+ _batched_mm.prepare(mm_prepare_pack);
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClWinogradConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
new file mode 100644
index 0000000000..eb2f7a72b2
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H
+#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClWinogradInputTransformKernel;
+class ClWinogradFilterTransformKernel;
+class ClWinogradOutputTransformKernel;
+} // kernels
+/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref kernels::ClWinogradInputTransformKernel
+ * -# @ref kernels::ClWinogradFilterTransformKernel (only once)
+ * -# @ref ClGemm
+ * -# @ref kernels::ClWinogradOutputTransformKernel
+ *
+ */
+class ClWinogradConv2d : public IClOperator
+{
+public:
+ /** Default constructor */
+ ClWinogradConv2d();
+ /** Default destructor */
+ ~ClWinogradConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClWinogradConv2d(const ClWinogradConv2d &) = delete;
+ /** Default move constructor */
+ ClWinogradConv2d(ClWinogradConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete;
+ /** Default move assignment operator */
+ ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default;
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
+ * @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p src
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClWinogradConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ ClGemm _batched_mm;
+ std::unique_ptr<kernels::ClWinogradInputTransformKernel> _input_transform;
+ std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
+ std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
+ CLFillBorderKernel _border_handler;
+ TensorInfo _input0;
+ TensorInfo _input1;
+ TensorInfo _batched_mm_output;
+ bool _is_prepared;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */