author     Jakub Sujak <jakub.sujak@arm.com>   2023-08-24 14:01:20 +0100
committer  Jakub Sujak <jakub.sujak@arm.com>   2023-09-04 14:41:16 +0000
commit     0d27b2ee8d811d66693555ac1e7be44d93e662e2 (patch)
tree       8b62a464a8bb9cd46702c8b5a60f3a97e3821b41 /src
parent     7ff03b67ba7ce669223f4d807e18fa3efa2f729b (diff)
download   ComputeLibrary-0d27b2ee8d811d66693555ac1e7be44d93e662e2.tar.gz
Remove legacy PostOps code
PostOps was the experimental interface for Dynamic Fusion. It is now replaced
by the new Dynamic Fusion interface with code generation using the Compute
Kernel Writer.

Resolves: COMPMID-6190
Change-Id: I813b48facef2fd6f3aee332588886b4f9b3d33d8
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10219
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
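For context on what is being removed: the deleted kernels below fuse a fixed post-op sequence onto the GEMM result before it is stored, namely an optional activation, a binary elementwise op, and another optional activation. The following is a minimal standalone C++ sketch of that per-element semantics; the names are illustrative only (not the library API), and ReLU and ADD are chosen as example post ops.

    // Sketch only: per-element effect of the act -> eltwise -> act post-op chain
    // that the removed kernels applied to each GEMM output value.
    #include <algorithm>
    #include <cstdio>

    static float relu(float x) { return std::max(x, 0.0f); }

    // dst = act( act(gemm_out) + eltwise_operand ), mirroring POST_OP1/2/3 below.
    static float apply_post_ops(float gemm_out, float eltwise_operand)
    {
        float c = relu(gemm_out);      // Post op 1: activation (optional)
        c       = c + eltwise_operand; // Post op 2: elementwise op (here: ADD)
        c       = relu(c);             // Post op 3: activation (optional)
        return c;
    }

    int main()
    {
        std::printf("%f\n", apply_post_ops(-1.5f, 2.0f)); // prints 2.000000
    }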
Diffstat (limited to 'src')
-rw-r--r--  src/BUILD.bazel | 2
-rw-r--r--  src/CMakeLists.txt | 2
-rw-r--r--  src/core/CL/CLUtils.cpp | 114
-rw-r--r--  src/core/CL/CLUtils.h | 91
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h | 103
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl | 372
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl | 1424
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl | 1399
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h | 274
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h | 113
-rw-r--r--  src/core/CL/cl_kernels/common/gemm.cl | 9
-rw-r--r--  src/core/experimental/PostOpUtils.h | 97
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.cpp | 10
-rw-r--r--  src/gpu/cl/ClKernelLibrary.cpp | 29
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp | 63
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h | 25
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 55
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h | 25
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 56
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h | 29
-rw-r--r--  src/gpu/cl/operators/ClConv2d.cpp | 32
-rw-r--r--  src/gpu/cl/operators/ClGemm.cpp | 16
-rw-r--r--  src/gpu/cl/operators/ClGemmConv2d.cpp | 50
-rw-r--r--  src/gpu/cl/operators/ClGemmConv2d.h | 18
-rw-r--r--  src/graph/DataLayerVisitor.cpp | 10
-rw-r--r--  src/graph/INode.cpp | 15
-rw-r--r--  src/graph/INodeVisitor.cpp | 10
-rw-r--r--  src/graph/backends/CL/CLFunctionsFactory.cpp | 6
-rw-r--r--  src/graph/backends/CL/CLNodeValidator.cpp | 4
-rw-r--r--  src/graph/mutators/NodeFusionMutator.cpp | 442
-rw-r--r--  src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp | 138
-rw-r--r--  src/graph/nodes/FusedConvolutionWithPostOpNode.cpp | 153
-rw-r--r--  src/graph/printers/DotGraphPrinter.cpp | 18
-rw-r--r--  src/runtime/CL/functions/CLConvolutionLayer.cpp | 40
-rw-r--r--  src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp | 33
35 files changed, 143 insertions, 5134 deletions
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f508b7ee2e..a02739f339 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -72,8 +72,6 @@ filegroup(
"graph/nodes/FlattenLayerNode.cpp",
"graph/nodes/FullyConnectedLayer.cpp",
"graph/nodes/FusedConvolutionBatchNormalizationNode.cpp",
- "graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp",
- "graph/nodes/FusedConvolutionWithPostOpNode.cpp",
"graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp",
"graph/nodes/GenerateProposalsLayerNode.cpp",
"graph/nodes/InputNode.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 76409239ea..39fba860fa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -73,8 +73,6 @@ target_sources(
graph/nodes/FlattenLayerNode.cpp
graph/nodes/FullyConnectedLayer.cpp
graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
- graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
- graph/nodes/FusedConvolutionWithPostOpNode.cpp
graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
graph/nodes/GenerateProposalsLayerNode.cpp
graph/nodes/InputNode.cpp
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 03f78697bc..7e56a3ba18 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -23,16 +23,14 @@
*/
#include "src/core/CL/CLUtils.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "support/StringSupport.h"
-#include "src/core/experimental/PostOpUtils.h"
-
namespace arm_compute
{
cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType image_type)
@@ -40,7 +38,7 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
const cl::Context &ctx = CLKernelLibrary::get().context();
- const cl::Buffer &buffer = tensor->cl_buffer();
+ const cl::Buffer &buffer = tensor->cl_buffer();
const ITensorInfo *info = tensor->info();
ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(),
"Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
@@ -113,112 +111,4 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
return cl::Image2D(cl_image);
}
-
-namespace experimental
-{
-PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config)
- : _supported_config(supported_config)
-{
- ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed");
- for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it)
- {
- auto post_op_sequence = it->first;
- auto post_op_slots = std::get<1>(it->second);
- ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots");
- }
-}
-
-bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops)
-{
- for(const auto &op : post_ops.get_list())
- {
- for(const auto &tensor : op->arguments())
- {
- const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape());
- // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting
- if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0))
- {
- return false;
- }
- // NOTE: Kernel limitation: currently only the following broadcasting types are supported:
- // 1. Post op arg is scalar, broadcast in both first and second dims
- // 2. Post op arg is of shape: second dim=1, first dim=N, broadcast only in second dim
- // This means this case: Post op arg is of shape: second dim=M, first dim=1, broadcast only in first dim, is NOT supported
- if(dst->dimension(0) > 1 && dst->dimension(1) > 1 && (*tensor)->dimension(0) == 1 && (*tensor)->dimension(1) > 1)
- {
- return false;
- }
- }
- }
- return true;
-}
-
-bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const
-{
- if(post_ops.size() == 0)
- {
- return true; // Always support cases where no post op is specified
- }
- const auto post_op_sequence = get_post_op_sequence(post_ops);
-
- return _supported_config.find(post_op_sequence) != _supported_config.end();
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList<ITensorInfo *> &post_ops) const
-{
- const auto post_op_sequence = get_post_op_sequence(post_ops);
- const auto slots = std::get<1>(_supported_config.at(post_op_sequence));
- for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id)
- {
- const auto &post_op = post_ops.get_list().at(post_op_id);
- const auto slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]);
- if(post_op->type() == experimental::PostOpType::Activation)
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<ITensorInfo *> *>(post_op.get());
- const auto act_type = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation()));
- const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a());
- const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b());
- build_opts.add_option(act_type);
- build_opts.add_option(act_a_val);
- build_opts.add_option(act_b_val);
- }
- else if(post_op->type() == experimental::PostOpType::Eltwise_Add)
- {
- size_t arg_id = 1;
- const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
- build_opts.add_option(eltwise_op);
- for(const auto &tensor : post_op->arguments())
- {
- const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
- const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
- build_opts.add_option(height);
- build_opts.add_option(width);
- ++arg_id;
- }
- }
- else if(post_op->type() == experimental::PostOpType::Eltwise_PRelu)
- {
- size_t arg_id = 1;
- const auto eltwise_op = slot_prefix + "_ELTWISE_OP=PRELU" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
- build_opts.add_option(eltwise_op);
- for(const auto &tensor : post_op->arguments())
- {
- const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
- const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
- build_opts.add_option(height);
- build_opts.add_option(width);
- ++arg_id;
- }
- }
- }
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const
-{
- const auto post_op_sequence = get_post_op_sequence(post_ops);
- const auto postfix = std::get<0>(_supported_config.at(post_op_sequence));
- kernel_name += postfix;
-}
-} // namespace experimental
-
} // namespace arm_compute
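The removed PostOpCLKernelUtils::set_post_ops_cl_build_options() above encodes each post op into slot-prefixed -D defines for the OpenCL compiler. A hedged sketch of that assembly for a hypothetical sequence { activation(ReLU) in slot 1, elementwise ADD in slot 2 }; the exact value formatting is illustrative, and the code below is a standalone reconstruction, not the removed API.

    // Sketch: slot-prefixed OpenCL build options in the style produced by the
    // removed set_post_ops_cl_build_options().
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        std::vector<std::string> build_opts;

        const std::string p1 = "-DP1"; // slot prefix for the first post op (activation)
        build_opts.push_back(p1 + "_ACTIVATION_TYPE=relu");
        build_opts.push_back(p1 + "_ACTIVATION_A_VAL=0.0"); // value formatting illustrative
        build_opts.push_back(p1 + "_ACTIVATION_B_VAL=0.0");

        const std::string p2 = "-DP2"; // slot prefix for the second post op (eltwise ADD)
        build_opts.push_back(p2 + "_ELTWISE_OP=ADD_X_POS_0"); // previous op's dst at position 0
        build_opts.push_back(p2 + "_ELTWISE_ARG1_HEIGHT=1");  // operand broadcast along Y
        build_opts.push_back(p2 + "_ELTWISE_ARG1_WIDTH=128");

        for(const auto &opt : build_opts)
        {
            std::cout << opt << '\n';
        }
    }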
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index e3f12d4b53..f0e79bccfc 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -22,11 +22,10 @@
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_CLUTILS_H
-#define ARM_COMPUTE_CL_CLUTILS_H
+#ifndef ACL_SRC_CORE_CL_CLUTILS_H
+#define ACL_SRC_CORE_CL_CLUTILS_H
#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include <map>
@@ -74,88 +73,6 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
* @return cl::Image2D object
*/
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+} // namespace arm_compute
-namespace experimental
-{
-/** @name (EXPERIMENTAL_POST_OPS)
- * @{
- */
-
-/** Manage validation, building and configurations of PostOp CL kernels */
-class PostOpCLKernelUtils final
-{
-public:
- /** CL kernel name postfix for post ops */
- using NamePostfix = std::string;
- /** CL kernels that supports post ops assign each post op to a 'slot', in accordance with the postfix
- * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots
- * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition
- *
- * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences.
- * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel.
- * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the
- * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }).
- * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence.
- * We can either:
- * 1. assign the elementwise to slot 1 and prelu to slot 2 of kernel 1
- * { { Eltwise_Add, PRelu } -> {"_eltwise_act", {1, 2} } } or
- * 2. assign the elementwise to slot 2 and prelu to slot 4 of kernel 1
- * { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } }
- */
- using Slots = std::vector<unsigned int>;
- using Config = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>;
-
-public:
- explicit PostOpCLKernelUtils(const Config &config);
-
- /** Check if post op argument tensor shapes are compliant
- * All post ops must not alter the shape of the original dst tensor (even after broadcasting)
- *
- * @param[in] dst Dst tensor to apply the post ops to
- * @param[in] post_ops Post ops
- *
- * @return true if shapes are compliant and false otherwise
- */
- static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops);
- /** Check if the post op sequence is supported in the current configuration
- *
- * @param[in] post_ops Post ops
- *
- * @return true if the post op sequence is supported and false otherwise
- */
- bool is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const;
- /** Helper function to set PostOp related build options
- * @note Convention
- * 1. Each post op "slot" is prefixed with "P<slot number>", followed by the usual parameters for that post op.
- * E.g. If the first slot is an activation, we need to pass 3 definitions in this way:
- * -P1_ACTIVATION_TYPE=... -P1_ACTIVATION_A_VAL=... -P1_ACTIVATION_B_VAL=...
- *
- * 2. For multi-ary post ops, to pass the position of the previous op's dest tensor,
- * we append "_X_POS_<pos>" to the post op type.
- * E.g. for a single post op add(dst, x), where dst is the result of the main op.
- * In this case, the position of the previous op's dest is 0, so we pass
- * -P1_ELTWISE_OP=ADD_X_POS_0
- *
- * @param[out] built_opts OpenCL kernel build options
- * @param[in] post_ops Post ops
- *
- */
- void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList<ITensorInfo *> &post_ops) const;
- /** Helper function to set PostOp kernel name
- *
- * @param[out] kernel_name OpenCL kernel name
- * @param[in] post_ops Post ops
- *
- */
- void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const;
-
-private:
- Config _supported_config{};
-};
-/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
-
-} // namespace experimental
-
-} // arm_compute
-
-#endif /* ARM_COMPUTE_CL_CLUTILS_H */
+#endif // ACL_SRC_CORE_CL_CLUTILS_H
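The removed header above documents the Config convention: a post-op type sequence maps to the kernel-name postfix plus the slot assigned to each post op. A standalone sketch of that mapping follows, using local stand-in types (the arm_compute::experimental PostOp types are deleted by this patch) and the { eltwiseadd, prelu } -> { "_act_eltwiseadd_act_prelu", {2, 4} } example from the removed doc comment.

    // Sketch of the removed PostOpCLKernelUtils::Config shape with stand-in types.
    #include <map>
    #include <string>
    #include <tuple>
    #include <vector>

    enum class PostOpType { Activation, Eltwise_Add, Eltwise_PRelu };

    using PostOpTypeSequence = std::vector<PostOpType>;
    using NamePostfix        = std::string;
    using Slots              = std::vector<unsigned int>;
    using Config             = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>;

    int main()
    {
        Config supported_config;
        const PostOpTypeSequence sequence = { PostOpType::Eltwise_Add, PostOpType::Eltwise_PRelu };
        supported_config[sequence] =
            std::make_tuple(NamePostfix("_act_eltwiseadd_act_prelu"), Slots{ 2, 4 });

        const auto &postfix = std::get<0>(supported_config.at(sequence)); // appended to the kernel name
        const auto &slots   = std::get<1>(supported_config.at(sequence)); // slot per post op, in order
        return (postfix == "_act_eltwiseadd_act_prelu" && slots.size() == 2) ? 0 : 1;
    }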
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
deleted file mode 100644
index 2c2d60ed13..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence:
- * act (optional): POST_OP1_ACTIVATION_OPTIONAL
- * eltwise_op : POST_OP2_ELTWISE_OP
- * act (optional): POST_OP3_ACTIVATION_OPTIONAL
- */
-
-/** Post Op 1: Activation Block (Optional)
- * @name POST_OP1_ACTIVATION_OPTIONAL
- * Toggled by -DP1_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
- MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-/** @} */ // end of group POST_OP1_ACTIVATION_OPTIONAL
-
-/** Post Op 2: Eltwise Op Block
- * Handles both broadcasting and non-broadcasting cases
- * @name POST_OP2_ELTWISE_OP
- *
- * @param[in] P2_ELTWISE_ARG1_HEIGHT Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] P2_ELTWISE_ARG1_WIDTH Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] OP The elementwise post op
- * @param[in] M0 The number of consecutive rows
- * @param[in] N0 The number of consecutive columns
- * @param[in] BASENAME The basename of the result variables
- * @param[in] ELTWISE_OPERAND_NAME The basename of the other operand variables
- * @param[in] ELTWISE_OPERAND_ROW The starting row of the other operand variables. Required as different boundary handling strategies are used by different kernels
- * E.g. reshaped_only_rhs and native kernels shifts rows (by using COMPUTE_M0_START_ROW) to handle boundary rows,
- * whereas reshaped kernels do not shift rows
- * @param[in] DATA_TYPE Data type of the result variables
- * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] ZERO Zero vector for z offset
- * @param[in] PARTIAL_LOAD_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
- * @param[in] PARTIAL_LOAD_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
- * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0.
- * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0.
- * @{
- */
-#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#if P2_ELTWISE_ARG1_HEIGHT == 1
-#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
- VEC_DATA_TYPE(DATA_TYPE, 1) \
- ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#else // P2_ELTWISE_ARG1_WIDTH == 1; Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_WIDTH == 1
-#else // P2_ELTWISE_ARG1_HEIGHT == 1; Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (ELTWISE_OPERAND_ROW * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_HEIGHT == 1
-#endif // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-/** @} */ // end of group POST_OP2_ELTWISE_OP
-/** Post Op 3: Activation Block (Optional)
- * @name POST_OP3_ACTIVATION_OPTIONAL
- * Toggled by -DP3_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
- MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-/** @} */ // end of group POST_OP3_ACTIVATION_OPTIONAL
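The deleted POST_OP2_ELTWISE_OP macro above selects one of three load paths depending on the eltwise operand shape: 1x1 (broadcast in X and Y), 1xN (broadcast in Y only), or MxN (no broadcast); the X-only case (width 1, height > 1) was rejected by the removed are_post_op_shapes_compliant(). A hedged plain-C++ sketch of that indexing rule, with ordinary array indexing standing in for the OpenCL tile loads:

    // Sketch: which operand element feeds output position (m, n) for each broadcast case.
    #include <vector>

    static float eltwise_arg_at(const std::vector<float> &arg, int arg_h, int arg_w, int m, int n)
    {
        const int row = (arg_h == 1) ? 0 : m; // broadcast along Y when height is 1
        const int col = (arg_w == 1) ? 0 : n; // broadcast along X when width is 1
        return arg[row * arg_w + col];
    }

    int main()
    {
        const std::vector<float> scalar_arg = { 2.0f };       // 1x1: case 1
        const std::vector<float> row_arg    = { 1.0f, 2.0f }; // 1x2: case 2
        const std::vector<float> full_arg   = { 1, 2, 3, 4 }; // 2x2: case 3
        const bool ok = eltwise_arg_at(scalar_arg, 1, 1, 1, 1) == 2.0f
                     && eltwise_arg_at(row_arg,    1, 2, 1, 1) == 2.0f
                     && eltwise_arg_at(full_arg,   2, 2, 1, 1) == 4.0f;
        return ok ? 0 : 1;
    }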
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
deleted file mode 100644
index 22ae098772..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h"
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define VFMA(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-
-#if defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_native, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z,
- const int M,
- const int N,
- const int K
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
-#if K0 > 1
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
-
- RHS_VFMA_M0xN0(0, a, b0, c);
- RHS_VFMA_M0xN0(1, a, b1, c);
-#if K0 > 2
- RHS_VFMA_M0xN0(2, a, b2, c);
-#endif // K0 > 2
-#if K0 > 3
- RHS_VFMA_M0xN0(3, a, b3, c);
-#endif // K0 > 3
-#if K0 > 4
- RHS_VFMA_M0xN0(4, a, b4, c);
- RHS_VFMA_M0xN0(5, a, b5, c);
- RHS_VFMA_M0xN0(6, a, b6, c);
- RHS_VFMA_M0xN0(7, a, b7, c);
-#endif // K0 > 4
-#if K0 > 8
- RHS_VFMA_M0xN0(8, a, b8, c);
- RHS_VFMA_M0xN0(9, a, b9, c);
- RHS_VFMA_M0xN0(A, a, bA, c);
- RHS_VFMA_M0xN0(B, a, bB, c);
- RHS_VFMA_M0xN0(C, a, bC, c);
- RHS_VFMA_M0xN0(D, a, bD, c);
- RHS_VFMA_M0xN0(E, a, bE, c);
- RHS_VFMA_M0xN0(F, a, bF, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * rhs_stride_y;
- }
-#endif // K0 > 1
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
- RHS_VFMA_M0xN0(0, a, b, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += rhs_stride_y;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
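To make the deleted native kernel above easier to follow, here is a hedged scalar C++ reference of what it computes per output element: c = ALPHA * (A * B) + BETA * bias, followed by the act -> eltwise ADD -> act post ops. Tiling, cl_image loads and boundary handling are omitted, and ReLU/ADD are example choices rather than fixed by the kernel.

    // Scalar reference sketch of gemm_mm_native_post_act_eltwise_op_act (illustration only).
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static float relu(float x) { return std::max(x, 0.0f); }

    int main()
    {
        const int M = 2, N = 2, K = 3;
        const float alpha = 1.0f, beta = 1.0f;
        const std::vector<float> lhs  = { 1, 2, 3,  4, 5, 6 };     // M x K
        const std::vector<float> rhs  = { 1, 0,  0, 1,  1, 1 };    // K x N
        const std::vector<float> bias = { 0.5f, -0.5f };           // broadcast along M
        const std::vector<float> eltwise_operand = { 1, 1, 1, 1 }; // M x N
        std::vector<float> dst(M * N, 0.0f);

        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                float c = 0.0f;
                for(int k = 0; k < K; ++k)
                {
                    c = std::fma(lhs[m * K + k], rhs[k * N + n], c); // RHS_VFMA_M0xN0, scalarised
                }
                c = alpha * c + beta * bias[n];  // ALPHA scaling plus broadcast bias
                c = relu(c);                     // POST_OP1_ACTIVATION_OPTIONAL
                c += eltwise_operand[m * N + n]; // POST_OP2_ELTWISE_OP (ADD)
                c = relu(c);                     // POST_OP3_ACTIVATION_OPTIONAL
                dst[m * N + n] = c;
            }
        }
        std::printf("dst[0]=%f dst[3]=%f\n", dst[0], dst[3]);
    }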
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
deleted file mode 100644
index 89577e9ebd..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
+++ /dev/null
@@ -1,1424 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#if defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- c += a.s4 * b.s4; \
- c += a.s5 * b.s5; \
- c += a.s6 * b.s6; \
- c += a.s7 * b.s7; \
- })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- c += a.s4 * b.s4; \
- c += a.s5 * b.s5; \
- c += a.s6 * b.s6; \
- c += a.s7 * b.s7; \
- c += a.s8 * b.s8; \
- c += a.s9 * b.s9; \
- c += a.sA * b.sA; \
- c += a.sB * b.sB; \
- c += a.sC * b.sC; \
- c += a.sD * b.sD; \
- c += a.sE * b.sE; \
- c += a.sF * b.sF; \
- })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#else // defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- c = fma(a.s4, b.s4, c); \
- c = fma(a.s5, b.s5, c); \
- c = fma(a.s6, b.s6, c); \
- c = fma(a.s7, b.s7, c); \
- })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- c = fma(a.s4, b.s4, c); \
- c = fma(a.s5, b.s5, c); \
- c = fma(a.s6, b.s6, c); \
- c = fma(a.s7, b.s7, c); \
- c = fma(a.s8, b.s8, c); \
- c = fma(a.s9, b.s9, c); \
- c = fma(a.sA, b.sA, c); \
- c = fma(a.sB, b.sB, c); \
- c = fma(a.sC, b.sC, c); \
- c = fma(a.sD, b.sD, c); \
- c = fma(a.sE, b.sE, c); \
- c = fma(a.sF, b.sF, c); \
- })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#endif // defined(MIXED_PRECISION)
-
-#if defined(ARM_DOT_K0XN0)
-#undef ARM_DOT_K0XN0
-#endif // defined(ARM_DOT_K0XN0)
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- ARM_DOT_K0((a), (b##4), (c.s4)); \
- ARM_DOT_K0((a), (b##5), (c.s5)); \
- ARM_DOT_K0((a), (b##6), (c.s6)); \
- ARM_DOT_K0((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- ARM_DOT_K0((a), (b##4), (c.s4)); \
- ARM_DOT_K0((a), (b##5), (c.s5)); \
- ARM_DOT_K0((a), (b##6), (c.s6)); \
- ARM_DOT_K0((a), (b##7), (c.s7)); \
- ARM_DOT_K0((a), (b##8), (c.s8)); \
- ARM_DOT_K0((a), (b##9), (c.s9)); \
- ARM_DOT_K0((a), (b##A), (c.sA)); \
- ARM_DOT_K0((a), (b##B), (c.sB)); \
- ARM_DOT_K0((a), (b##C), (c.sC)); \
- ARM_DOT_K0((a), (b##D), (c.sD)); \
- ARM_DOT_K0((a), (b##E), (c.sE)); \
- ARM_DOT_K0((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-
-#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
- if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
- (get_global_id(2) * lhs_stride_z);
-
- // Compute RHS matrix address
- __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += get_global_id(2) * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
- for(int i = 0; i < K; i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
- rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
- 2) * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
- if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
- (get_global_id(2) * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
- for(int i = 0; i < K; i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
- // Load values from RHS matrix stored in a cl_image
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Accumulate
- ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-
- x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
- 2) * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
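
The three post ops fused into the kernels above always follow the same act -> eltwise -> act pattern. A minimal plain-C model of that chain on a single accumulator value; the relu activation and the ADD elementwise op are illustrative stand-ins for whatever -DP1_ACTIVATION_TYPE / -DP2_ELTWISE_OP / -DP3_ACTIVATION_TYPE select, and post_op_chain is not a library function:

    static float act_relu(float x) { return x > 0.0f ? x : 0.0f; }

    static float post_op_chain(float acc, float eltwise_operand)
    {
        float c = act_relu(acc);   /* post op 1: activation (optional) */
        c += eltwise_operand;      /* post op 2: elementwise op (ADD assumed) */
        c = act_relu(c);           /* post op 3: activation (optional) */
        return c;
    }
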
-
-#if defined(LHS_TRANSPOSE)
-
-#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
-
-#if defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#else // defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#endif // defined(MIXED_PRECISION)
-
-#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
- })
-#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
- })
-#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
- })
-#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
- })
-#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
- })
-
-// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
-// a is the column-vector (transposed)
-// b is the row-vector (not transposed)
-// C is the output matrix
-// Lower case is a vector (a, b)
-// Upper case is a matrix (C)
-#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
-
-#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
- })
-#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
- })
-#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
- })
-#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
- })
-#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
- })
-#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
- })
-
-// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
-// The dimensions for this matrix multiplications are defined through M0, N0 and K0
-// The dimensions supported are:
-// M0: 1, 2, 3, 4, 8
-// N0: 1, 2, 3, 4, 8, 16
-// K0: 1, 2, 3, 4, 8, 16
-// This macro calls the vector-by-matrix macro K0 times
-// A, B and C are matrices
-#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
- CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
- (M0, N0, TYPE, A, B, C)
-
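
The factory macros above build the transposed-LHS (T) by non-transposed-RHS (NT) block product out of K0 rank-1 updates. A minimal plain-C sketch of the expansion; arm_mm_t_nt_model and the fixed M0/N0/K0 values are illustrative only:

    #include <math.h>

    #define M0 4
    #define N0 4
    #define K0 2

    /* a[k] holds the k-th LHS column fragment (M0 values, LHS is transposed),
     * b[k] holds the k-th RHS row fragment (N0 values), c accumulates the M0xN0 block. */
    static void arm_mm_t_nt_model(const float a[K0][M0], const float b[K0][N0], float c[M0][N0])
    {
        for (int k = 0; k < K0; ++k)          /* ARM_MM_T_NT_M0xN0xK0 */
        {
            for (int m = 0; m < M0; ++m)      /* ARM_VVM_T_NT_M0xN0x1 */
            {
                for (int n = 0; n < N0; ++n)
                {
                    c[m][n] = fmaf(a[k][m], b[k][n], c[m][n]); /* ARM_VFMA */
                }
            }
        }
    }
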
-#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#endif // defined(RHS_INTERLEAVE)
-
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
- const uint z = get_global_id(2);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
- // Compute RHS matrix address
- __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
- __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
- __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
-
- for(int i = 0; i < K; i += K0)
- {
- VEC_DATA_TYPE(DATA_TYPE, M0)
- a0;
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
-#if K0 > 1
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
- lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
-#ifndef RHS_INTERLEAVE
- rhs += (N0 * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
- 2) * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#endif // defined(RHS_INTERLEAVE)
-
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
- const uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
- __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
-
- for(int i = 0; i < K; i += K0)
- {
- VEC_DATA_TYPE(DATA_TYPE, M0)
- a0;
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
-#if K0 > 1
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
- lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
- x_rhs += K0 * RHS_STEP_X;
-#ifndef RHS_INTERLEAVE
- x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#endif // defined(LHS_TRANSPOSE)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
deleted file mode 100644
index 09ddcde043..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
+++ /dev/null
@@ -1,1399 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define CONCAT(a, b) a##b
-
-#define ARM_DOT1(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT2(a, b, c); \
- c = fma((a.s2), (b.s2), c); \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT3(a, b, c); \
- c = fma((a.s3), (b.s3), c); \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##8), (c.s8)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##9), (c.s9)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##A), (c.sA)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##B), (c.sB)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##C), (c.sC)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##D), (c.sD)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##E), (c.sE)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-
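
The ARM_DOT*/ARM_DOT_K0XN0 macros above accumulate one K0-wide dot product per reshaped RHS block row into the matching lane of the output row. A scalar plain-C model; arm_dot_k0xn0_model and the K0/N0 values are illustrative:

    #include <math.h>

    #define K0 4
    #define N0 4

    /* a: K0 LHS values of one output row; b[n]: K0 values of the n-th reshaped
     * RHS block row; c: the N0 accumulators of that output row. */
    static void arm_dot_k0xn0_model(const float a[K0], const float b[N0][K0], float c[N0])
    {
        for (int n = 0; n < N0; ++n)
        {
            for (int k = 0; k < K0; ++k)
            {
                c[n] = fmaf(a[k], b[n][k], c[n]); /* CONCAT(ARM_DOT, K0) */
            }
        }
    }
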
-#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS reshaped matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(1, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(1, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(1, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(1, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(1, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(1, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(1, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(1, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
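
The kernel above does not require K to be a multiple of K0: a K0-wide main loop is followed by a scalar "left-over accumulations" loop. The same structure reduced to a single 1-D dot product, as a sketch only (dot_with_tail is not library code):

    /* Main loop consumes K0 elements per iteration; the tail consumes the rest. */
    static float dot_with_tail(const float *a, const float *b, int K, int K0)
    {
        float acc = 0.0f;
        int i = 0;
        for (; i <= (K - K0); i += K0)
        {
            for (int k = 0; k < K0; ++k)
            {
                acc += a[i + k] * b[i + k];
            }
        }
        for (; i < K; ++i) /* left-over accumulations */
        {
            acc += a[i] * b[i];
        }
        return acc;
    }
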
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
- const uint LEFTOVER_K = K % K0;
-
- // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix stored in a cl_image
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- if(LEFTOVER_K != 0)
- {
-        // Note: We cannot read out-of-bounds elements from the RHS matrix because
-        // the RHS width is always a multiple of K0. This is not necessarily true for the LHS matrix
-
- union UNION_VEC_TYPE
- {
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
-
- union UNION_VEC_TYPE a0 = {.v = 0 };
-#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
-#endif // M0 > 1
-#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
-#endif // M0 > 2
-#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
-#endif // M0 > 3
-#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
-#endif // M0 > 4
-#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
-#endif // M0 > 5
-#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
-#endif // M0 > 6
-#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
-#endif // M0 > 7
-
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
-#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
-#endif // M0 > 1
-#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
-#endif // M0 > 2
-#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
-#endif // M0 > 3
-#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
-#endif // M0 > 4
-#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
-#endif // M0 > 5
-#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
-#endif // M0 > 6
-#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
-#endif // M0 > 7
-
- lhs_offset += sizeof(DATA_TYPE);
- }
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
-#endif // M0 > 7
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
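
In the cl_image variant above, the RHS read is always K0-aligned, so the LEFTOVER_K tail is handled by zero-padding the LHS fragment and still issuing a full K0-wide dot product. A plain-C model of that trick; tail_dot_zero_padded and the K0 value are illustrative:

    #include <string.h>

    #define K0 4

    static float tail_dot_zero_padded(const float *lhs_tail, int leftover_k, const float rhs[K0])
    {
        float a[K0] = { 0.0f };                  /* union UNION_VEC_TYPE a = {.v = 0} */
        memcpy(a, lhs_tail, (size_t)leftover_k * sizeof(float));

        float acc = 0.0f;
        for (int k = 0; k < K0; ++k)             /* zero-padded lanes add nothing */
        {
            acc += a[k] * rhs[k];
        }
        return acc;
    }
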
-
-#define VFMA(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-
-#if M0 == 1
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-
-#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS reshaped matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(0, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(4, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(5, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(6, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(8, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(9, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(A, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(B, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(C, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(D, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(E, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(0, a, b0, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(0, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(4, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(5, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(6, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(8, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(9, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(A, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(B, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(C, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(D, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(E, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
- VFMA_M0xN0(0, a, b0, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- x_rhs += RHS_STEP_X;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
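
All of the *_post_act_eltwise_op_act kernel variants removed in this file share one fused epilogue: the accumulated block is scaled by ALPHA, BETA * bias is added, and then post op 1 (optional activation), post op 2 (binary elementwise op against the extra eltwise_operand input) and post op 3 (optional activation) run before the boundary-aware store. The following is a minimal scalar sketch of that epilogue, assuming ReLU for both activations and addition for the elementwise op; the real kernels select these through the -DP1_*/-DP2_*/-DP3_* build options, and the names below are illustrative rather than library code.

    #include <algorithm>

    // Scalar model of the removed epilogue: c = act3(eltwise(act1(alpha * acc + beta * bias), operand))
    static float fused_epilogue(float acc, float bias, float operand, float alpha, float beta)
    {
        float c = alpha * acc + beta * bias; // ALPHA/BETA scaling (SCALE_BLOCK / ADD_BLOCK)
        c       = std::max(c, 0.0f);         // post op 1: optional activation (ReLU assumed)
        c       = c + operand;               // post op 2: elementwise op (addition assumed)
        c       = std::max(c, 0.0f);         // post op 3: optional activation (ReLU assumed)
        return c;
    }

Because ALPHA, BETA and the three post-op slots are all compile-time -D options, each removed kernel variant baked one specific epilogue into its binary.
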
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
deleted file mode 100644
index b584251c2a..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */
-
-/** List of (binary) elementwise operators, accounting for the argument position of argument X
- * @note X_Pos denotes the position of argument X. e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place
- * @name elementwise_post_ops
- * @{
- */
-#if defined(N0) && !defined(VEC_SIZE)
-#define VEC_SIZE N0
-#endif // defined(N0) && !defined(VEC_SIZE)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
-#define ADD_X_POS_0(x, y) (x) + (y)
-#define SUB_X_POS_0(x, y) (x) - (y)
-#define MAX_X_POS_0(x, y) max(x, y)
-#define MIN_X_POS_0(x, y) min(x, y)
-#define SQUARED_DIFF_X_POS_0(x, y) (x - y) * (x - y)
-#define POWER_X_POS_0(x, y) pow(x, y)
-#if VEC_SIZE == 1
-#define PRELU_X_POS_0(x, y) (x > 0 ? x : x * y)
-#else // VEC_SIZE == 1
-
-#if defined(MIXED_PRECISION)
-#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
-#else // MIXED_PRECISION
-#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
-#endif // MIXED_PRECISION
-
-#endif // VEC_SIZE == 1
-#define DIV_X_POS_0(x, y) (x / y)
-#define AND_X_POS_0(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
-#define OR_X_POS_0(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
-
-#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
-#define SUB_X_POS_1(x, y) (y) - (x)
-#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
-#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
-#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
-#define POWER_X_POS_1(x, y) pow(y, x)
-#if VEC_SIZE == 1
-#define PRELU_X_POS_1(x, y) (y > 0 ? y : y * x)
-#else // VEC_SIZE == 1
-
-#if defined(MIXED_PRECISION)
-#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
-#else // MIXED_PRECISION
-#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
-#endif // MIXED_PRECISION
-
-#endif // VEC_SIZE == 1
-#define DIV_X_POS_1(x, y) (y / x)
-#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
-#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)
-
-// By default use the order of the arguments as they are passed in, ie. _X_POS_0
-#define ADD(x, y) ADD_X_POS_0(x, y)
-#define SUB(x, y) SUB_X_POS_0(x, y)
-#define MAX(x, y) MAX_X_POS_0(x, y)
-#define MIN(x, y) MIN_X_POS_0(x, y)
-#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
-#define POWER(x, y) POWER_X_POS_0(x, y)
-#define PRELU(x, y) PRELU_X_POS_0(x, y)
-#define DIV(x, y) DIV_X_POS_0(x, y)
-#define AND(x, y) AND_X_POS_0(x, y)
-#define OR(x, y) OR_X_POS_0(x, y)
-
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
-/** @} */ // end of group elementwise_post_ops
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name ELTWISE_OP_ROW_n
- *
- * @param[in] OP The elementwise post op
- * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);
-
-#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);
-
-#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
- OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);
-
-#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
- OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);
-
-#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
- OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);
-
-#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
- OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);
-
-#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
- OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);
-
-#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
- OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);
-
-#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
- OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);
-
-#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
- OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);
-
-#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
- OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);
-
-#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
- OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);
-
-#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
- OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);
-
-#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
- OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);
-
-#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
- OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);
-
-#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
- OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);
-
-/** @} */ // end of group ELTWISE_OP_ROW_n
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name ELTWISE_OP_BLOCK
- *
- * Supported cases are N=1,2,3,...,16
- *
- * @param[in] OP The elementwise post op
- * @param[in] N The number of vectors in the block
- * @param[in] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
-#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)
-/** @} */ // end of group ELTWISE_OP_BLOCK
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
- * @name ELTWISE_OP_ROW_BROADCAST_n
- *
- * @param[in] OP The elementwise post op
- * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the broadcast operand 2 variables
- * @{
- */
-#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##0 = OP(OPERAND1##0, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##1 = OP(OPERAND1##1, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
- OPERAND1##2 = OP(OPERAND1##2, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
- OPERAND1##3 = OP(OPERAND1##3, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
- OPERAND1##4 = OP(OPERAND1##4, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
- OPERAND1##5 = OP(OPERAND1##5, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
- OPERAND1##6 = OP(OPERAND1##6, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
- OPERAND1##7 = OP(OPERAND1##7, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
- OPERAND1##8 = OP(OPERAND1##8, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
- OPERAND1##9 = OP(OPERAND1##9, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
- OPERAND1##A = OP(OPERAND1##A, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
- OPERAND1##B = OP(OPERAND1##B, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
- OPERAND1##C = OP(OPERAND1##C, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
- OPERAND1##D = OP(OPERAND1##D, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
- OPERAND1##E = OP(OPERAND1##E, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
- OPERAND1##F = OP(OPERAND1##F, OPERAND2);
-
-/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
- * @name ELTWISE_OP_BLOCK_BROADCAST
- * @note Only support:
- * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX];
- * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar);
- *       Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1];
- *
- * Supported cases are N=1,2,3,...,16
- *
- * @param[in] OP The elementwise post op
- * @param[in] N The number of vectors in the block
- * @param[in] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2)
-#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2)
-/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST
\ No newline at end of file
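
The ELTWISE_OP_ROW_n and ELTWISE_OP_BLOCK macros deleted above unroll one statement per row: each row macro expands the previous one and appends a statement whose variable names are formed by gluing the row index onto the basenames with ##. A self-contained sketch of the same unrolling pattern, using stand-in names rather than the removed ACL macros:

    #define OP_ADD(x, y) ((x) + (y))

    // Each row macro expands the previous one and appends one more statement.
    #define ELT_ROW_1(OP, A, B) A##0 = OP(A##0, B##0);
    #define ELT_ROW_2(OP, A, B) ELT_ROW_1(OP, A, B) A##1 = OP(A##1, B##1);
    #define ELT_ROW_3(OP, A, B) ELT_ROW_2(OP, A, B) A##2 = OP(A##2, B##2);

    // Two-level expansion so the repeat count can itself be a macro (e.g. M0).
    #define ELT_BLOCK_STR(OP, N, A, B) ELT_ROW_##N(OP, A, B)
    #define ELT_BLOCK(OP, N, A, B) ELT_BLOCK_STR(OP, N, A, B)

    int main()
    {
        float c0 = 1.f, c1 = 2.f, c2 = 3.f;
        float b0 = 10.f, b1 = 20.f, b2 = 30.f;
        ELT_BLOCK(OP_ADD, 3, c, b) // expands to: c0 = c0 + b0; c1 = c1 + b1; c2 = c2 + b2;
        return (c0 + c1 + c2) > 65.f ? 0 : 1; // sums to 66.f, so returns 0
    }

The broadcast variants follow the same scheme but reuse a single OPERAND2 vector (or scalar) for every row instead of indexing it per row.
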
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
deleted file mode 100644
index e107f4452d..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "gemm_helpers.h"
-#include "load_store_utility.h"
-
-/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations
- * -DMIXED_PRECISION toggles mixed precision mode
- */
-
-/** Mixed-Precision-Aware Activation Block
- * @name MIXED_PRECISION_ACTIVATION_BLOCK
- * params N ... B_VAL: same as those in @ref ACTIVATION_BLOCK
- *
- * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
- ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
- ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK
-
-/** Mixed-Precision-Aware Elementwise Op Block
- * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name MIXED_PRECISION_ELTWISE_OP_BLOCK
- *
- * @param[in] OP The elementwise post op
- * @param[in] M0 The number of consecutive rows
- * @param[in] N0 The number of consecutive columns
- * @param[in] OPERAND1 The basename of the first and result operand variables
- * @param[in] OPERAND2 The basename of the second operand variables
- * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \
- ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK
-
-/** Mixed-Precision-Aware Elementwise Op Broadcast Block
- * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
- * @note Only support:
- * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1
- * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1
- *       Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1
- *
- * @param[in] OP The elementwise post op
- * @param[in] M0 The number of consecutive rows, > 1
- * @param[in] N0 The number of consecutive columns, >= 1
- * @param[in] OPERAND1 The basename of the first and result operand variables
- * @param[in] OPERAND2 The basename of the second operand variables
- * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \
- ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
-
-/** Mixed-Precision-Aware Boundary-Aware Store Block
- * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
- * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE
- *
- * @param[in] BASENAME_LP The name of the low precision variables, converted from BASENAME, in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
- CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP); \
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
\ No newline at end of file
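
The MIXED_PRECISION variants removed above all encode one rule: when -DMIXED_PRECISION is set, the elementwise operand is converted up to DATA_TYPE_ACCUMULATOR before the op, the activation runs directly on the accumulator type, and the block is converted back down to DATA_TYPE only when it is stored. A rough, self-contained sketch of that rule, with double and float standing in for the accumulator and data types (the kernels typically pair float accumulators with half data):

    using acc_t  = double; // stand-in for DATA_TYPE_ACCUMULATOR
    using data_t = float;  // stand-in for DATA_TYPE

    // Widen the operand, operate in the wider type, narrow only on store.
    static void eltwise_add_then_store(acc_t &c, data_t operand, data_t &dst)
    {
        const acc_t widened = static_cast<acc_t>(operand); // CONVERT_BLOCK step
        c   = c + widened;                                 // ELTWISE_OP_BLOCK step
        dst = static_cast<data_t>(c);                      // convert-on-store, as in MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
    }

    int main()
    {
        acc_t  acc     = 1.25;  // running accumulator in the wider type
        data_t operand = 0.5f;  // eltwise operand as stored in memory
        data_t out     = 0.f;
        eltwise_add_then_store(acc, operand, out); // out == 1.75f
        return out > 1.f ? 0 : 1;
    }
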
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index a32301d8e3..0c30c0e626 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -152,7 +152,6 @@
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
@@ -453,7 +452,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -887,7 +885,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
@@ -1213,7 +1210,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -1713,7 +1709,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
@@ -1993,7 +1988,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -2380,7 +2374,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -2767,7 +2760,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
@@ -3226,7 +3218,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS matrix is NOT reshaped
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
diff --git a/src/core/experimental/PostOpUtils.h b/src/core/experimental/PostOpUtils.h
deleted file mode 100644
index 6217dcc3da..0000000000
--- a/src/core/experimental/PostOpUtils.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2021, 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
-#define ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
-
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
-
-#include "arm_compute/core/experimental/Types.h"
-#include "support/Cast.h"
-
-#include <vector>
-
-/** (EXPERIMENTAL_POST_OPS) */
-namespace arm_compute
-{
-namespace experimental
-{
-/** Transform a PostOpList of type FromTensorT to one of type ToTensorT */
-template <typename FromTensorT, typename ToTensorT>
-PostOpList<ToTensorT> transform_post_op_list_arguments(const PostOpList<FromTensorT> &post_ops, std::function<ToTensorT(FromTensorT)> transform_arg)
-{
- PostOpList<ToTensorT> transformed_post_ops;
- for(const auto &post_op : post_ops.get_list())
- {
- switch(post_op->type())
- {
- case PostOpType::Activation:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const PostOpAct<FromTensorT> *>(post_op.get());
- transformed_post_ops.template push_back_op<PostOpAct<ToTensorT>>(_post_op->_act_info);
- break;
- }
- case PostOpType::Eltwise_Add:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwiseAdd<FromTensorT> *>(post_op.get());
- transformed_post_ops.template push_back_op<PostOpEltwiseAdd<ToTensorT>>(transform_arg(_post_op->_addend), _post_op->_prev_dst_pos, _post_op->_policy);
- break;
- }
- case PostOpType::Eltwise_PRelu:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwisePRelu<FromTensorT> *>(post_op.get());
- transformed_post_ops.template push_back_op<PostOpEltwisePRelu<ToTensorT>>(transform_arg(_post_op->_alpha_param), _post_op->_prev_dst_pos, _post_op->_policy);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- }
- }
- }
- return transformed_post_ops;
-}
-
-/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */
-inline TensorType get_post_op_arg_type(size_t index)
-{
- ARM_COMPUTE_ERROR_ON_MSG(static_cast<int>(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument index is out of range");
- return static_cast<TensorType>(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast<int>(index));
-}
-
-/** Get a sequence of PostOp Types from PostOpList */
-template <typename T>
-PostOpTypeSequence get_post_op_sequence(const PostOpList<T> &post_ops)
-{
- PostOpTypeSequence post_op_sequence;
- for(const auto &op : post_ops.get_list())
- {
- post_op_sequence.push_back(op->type());
- }
- return post_op_sequence;
-}
-
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
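
transform_post_op_list_arguments(), removed above, walks a type-erased PostOpList, downcasts each op on its PostOpType tag, and rebuilds the list with every tensor-typed argument rebound through the caller-supplied transform. Below is a self-contained sketch of that dispatch-and-rebuild pattern using simplified stand-in types, not the removed IPostOp/PostOpList classes:

    #include <functional>
    #include <memory>
    #include <vector>

    enum class OpType { Activation, EltwiseAdd };

    template <typename TensorT>
    struct Op
    {
        virtual ~Op()               = default;
        virtual OpType type() const = 0;
    };

    template <typename TensorT>
    struct EltwiseAddOp : Op<TensorT>
    {
        explicit EltwiseAddOp(TensorT addend) : addend(addend) {}
        OpType  type() const override { return OpType::EltwiseAdd; }
        TensorT addend; // the tensor-typed argument that needs rebinding
    };

    // Rebuild the list with each tensor argument passed through transform_arg,
    // mirroring the FromTensorT -> ToTensorT conversion in the removed helper.
    template <typename From, typename To>
    std::vector<std::unique_ptr<Op<To>>> transform_ops(const std::vector<std::unique_ptr<Op<From>>> &ops,
                                                       std::function<To(From)> transform_arg)
    {
        std::vector<std::unique_ptr<Op<To>>> out;
        for(const auto &op : ops)
        {
            if(op->type() == OpType::EltwiseAdd)
            {
                const auto *e = static_cast<const EltwiseAddOp<From> *>(op.get());
                out.push_back(std::make_unique<EltwiseAddOp<To>>(transform_arg(e->addend)));
            }
            // ... other op types get their own downcast branch, as in the removed switch
        }
        return out;
    }

    int main()
    {
        std::vector<std::unique_ptr<Op<int>>> src;
        src.push_back(std::make_unique<EltwiseAddOp<int>>(7));
        // Rebind the argument type, e.g. as the removed code did when mapping ITensorInfo* arguments to runtime tensors.
        auto dst = transform_ops<int, float>(src, [](int v) { return static_cast<float>(v); });
        return dst.size() == 1 ? 0 : 1;
    }
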
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index d11e4f0b24..39b410d609 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -107,7 +107,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
// Create GEMMInfo structure
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format);
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
// Supported activations in GEMM
const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
@@ -156,8 +156,8 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info,
- experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
+ weight_format));
auto mm_mem_req = _mm_gemmlowp->workspace();
for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -188,7 +188,7 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
// Create GEMMInfo structure
const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format);
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
if(is_quantized)
{
@@ -422,7 +422,7 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weights_info.weight_format());
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index de2e9f9742..e4a3d30b6d 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -275,23 +275,14 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "gemm_mm_native", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" },
{ "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" },
- { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
{ "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
- { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
- { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
{ "gemm_lc_vm_f32", "common/gemm.cl" },
{ "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
{ "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
@@ -623,26 +614,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/common/gemm_utils.clembed"
},
{
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.hembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.hembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed"
- },
- {
"common/gemmlowp.cl",
#include "./cl_kernels/common/gemmlowp.clembed"
},
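
The two tables touched above are plain name-to-path maps: one resolves a kernel name to the .cl file that defines it, the other embeds each .cl file's source. Once the *_post_act_eltwise_op_act entries are gone, a lookup for any of those names simply misses. A standalone sketch of that lookup behaviour, with a hypothetical two-entry map standing in for the real ClKernelLibrary tables:

    #include <iostream>
    #include <map>
    #include <string>

    int main()
    {
        // Hypothetical subset of the kernel -> program map; the real table lives in ClKernelLibrary.cpp.
        const std::map<std::string, std::string> kernel_program_map =
        {
            { "gemm_mm_native",               "common/gemm.cl" },
            { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
        };

        const auto hit  = kernel_program_map.find("gemm_mm_native");
        const auto miss = kernel_program_map.find("gemm_mm_native_post_act_eltwise_op_act"); // removed kernel name

        std::cout << "gemm_mm_native -> " << (hit != kernel_program_map.end() ? hit->second : "<not found>") << "\n";
        std::cout << "post-op variant -> " << (miss != kernel_program_map.end() ? miss->second : "<not found>") << "\n";
        return 0;
    }
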
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index 5fea097ae3..b8997dfc7f 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -31,11 +30,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLUtils.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -52,25 +51,6 @@ namespace
{
using ElementsProcessed = Steps;
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
- // PostOp sequence -> {Kernel Postfix, PostOp Slots}
- { {}, { "", {} } },
- { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
- { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
- { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
- { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
- { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
@@ -90,7 +70,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -133,7 +112,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -240,7 +218,6 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
_reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
- _num_post_op_args = gemm_info.post_ops.total_num_arguments();
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -298,20 +275,11 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- // If post_ops are used, then we disable the use of gemm_info.activation_info
- if(gemm_info.post_ops.size() > 0)
- {
- post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
- }
- else
- {
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- }
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_native");
- post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -396,11 +364,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args);
+ idx0 = 4 * num_arguments_per_2D_tensor() + 7;
}
else
{
- idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args);
+ idx0 = 3 * num_arguments_per_2D_tensor() + 6;
}
const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -412,11 +380,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+ idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
}
else
{
- idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+ idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0);
}
const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -440,12 +408,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
add_2D_tensor_argument(idx, src2, slice);
}
add_2D_tensor_argument(idx, dst, slice);
- // post op argument buffers
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- add_2D_tensor_argument(idx, post_op_arg, slice);
- }
+
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
if(_add_bias)
@@ -453,12 +416,6 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
}
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- // post op argument stride_z
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
- }
// Pass m, n and k at runtime
_kernel.setArg<cl_int>(idx++, _m);
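
The idx0 changes in this kernel are pure arithmetic: with the post-op count fixed at zero, the legacy formula (4 + n) * num_arguments_per_2D_tensor() + (7 + n) collapses to 4 * num_arguments_per_2D_tensor() + 7, so dropping _num_post_op_args leaves the remaining kernel-argument offsets untouched. A small standalone check of that equivalence, using a hypothetical per-tensor argument count:

    #include <cassert>

    int main()
    {
        const unsigned int args_per_2d_tensor = 3; // hypothetical; stands in for num_arguments_per_2D_tensor()
        const unsigned int num_post_op_args   = 0; // the legacy field is always zero once PostOps are removed

        // Bias path: legacy formula vs. simplified formula
        const unsigned int old_idx0 = (4 + num_post_op_args) * args_per_2d_tensor + (7 + num_post_op_args);
        const unsigned int new_idx0 = 4 * args_per_2d_tensor + 7;
        assert(old_idx0 == new_idx0);

        return 0;
    }
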
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index e478df727a..80f8355932 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/common/Macros.h"
@@ -76,17 +76,16 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
- unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index f14a6f1900..d72d29ea1e 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -31,11 +30,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -53,25 +52,6 @@ namespace
{
using ElementsProcessed = Steps;
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
- // PostOp sequence -> {Kernel Postfix, PostOp Slots}
- { {}, { "", {} } },
- { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
- { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
- { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
- { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
- { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
@@ -95,7 +75,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -139,7 +118,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -202,7 +180,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
- _num_post_op_args = gemm_info.post_ops.total_num_arguments();
// Check if we need to slide the matrix B
const unsigned int num_dimensions_src0 = src0->num_dimensions();
@@ -260,23 +237,14 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- // If post_ops are used, then we disable the use of gemm_info.activation_info
- if(gemm_info.post_ops.size() > 0)
- {
- post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
- }
- else
- {
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- }
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_reshaped_");
kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
- post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -395,13 +363,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
// dst buffer
add_2D_tensor_argument(idx, dst, slice);
- // post op argument buffers
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- add_2D_tensor_argument(idx, post_op_arg, slice);
- }
-
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
@@ -417,12 +378,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
// dst stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- // post op argument stride_z
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
- }
// Cross-plan padding (if _reinterpret_output_as_3d = true)
if(_reinterpret_output_as_3d)
{
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 2d668b91a3..8d25412a40 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
@@ -100,17 +100,16 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
- unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ bool _export_to_cl_image{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
\ No newline at end of file
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index f780538f53..b34c17cda8 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -23,13 +23,12 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -47,25 +46,6 @@ namespace
{
using ElementsProcessed = Steps;
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
- // PostOp sequence -> {Kernel Postfix, PostOp Slots}
- { {}, { "", {} } },
- { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
- { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
- { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
- { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
- { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
{
@@ -86,7 +66,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -132,7 +111,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -203,7 +181,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
_has_pad_y = gemm_info.has_pad_y;
- _num_post_op_args = gemm_info.post_ops.total_num_arguments();
auto padding_info = get_padding_info({ src0, src1, src2, dst });
@@ -270,22 +247,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
}
- // If post_ops are used, then we disable the use of gemm_info.activation_info
- if(gemm_info.post_ops.size() > 0)
- {
- post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
- }
- else
- {
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- }
+
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_reshaped_only_rhs_");
kernel_name += rhs_info.transpose ? "t" : "nt";
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
- post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -411,13 +380,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
// dst buffer
add_2D_tensor_argument(idx, dst, slice);
- // post op argument buffers
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- add_2D_tensor_argument(idx, post_op_arg, slice);
- }
-
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
@@ -432,12 +394,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
// dst stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
- // post op argument stride_z
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
- }
// Cross-plan padding (if _reinterpret_input_as_3d = true)
if(_reinterpret_input_as_3d && _has_pad_y)
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index 00cdb299ce..471160c94b 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
@@ -90,19 +90,18 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- bool _has_pad_y{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
- unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ bool _export_to_cl_image{ false };
+ bool _has_pad_y{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index 51248d4a7a..eb9475ccaa 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,7 +90,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
case ConvolutionMethod::WINOGRAD:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
auto f = std::make_unique<ClWinogradConv2d>();
f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
_operator = std::move(f);
@@ -99,7 +98,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
case ConvolutionMethod::DIRECT:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
auto f = std::make_unique<ClDirectConv2d>();
f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
_operator = std::move(f);
@@ -108,7 +106,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
case ConvolutionMethod::INDIRECT:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
auto f = std::make_unique<ClIndirectConv2d>();
f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
_operator = std::move(f);
@@ -142,7 +139,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
{
//Validate Winograd
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClWinogradConv2d does not support PostOps");
ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
break;
}
@@ -150,7 +146,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
{
// Validate direct convolution layer
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClDirectConv2d does not support PostOps");
ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
break;
}
@@ -158,7 +153,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
{
// Validate indirect convolution layer
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClIndirectConv2d does not support PostOps");
ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
break;
}
@@ -271,17 +265,17 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
if(is_data_type_float(src->data_type()))
{
// Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
- const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
- const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
- const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
- const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
- const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
- const bool is_m_one = output_shape[1] * output_shape[2] == 1;
- const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
- const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
+ const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
+ const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+ const bool is_m_one = output_shape[1] * output_shape[2] == 1;
+ const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+ const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
// Run Winograd if valid and IFM >= 8
if(is_wino_valid && is_ifm_ge_8)
@@ -330,7 +324,7 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
{
const bool is_kernel_sz_odd = kernel_sz % 2;
const bool is_g77 = gpu_target == GPUTarget::G77;
- preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
+ preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
}
// Direct/indirect convolution used for the first layer of the network
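
The realigned heuristic above keeps its logic unchanged: indirect convolution is preferred only when the kernel area (kernel_w * kernel_h) is odd, greater than 1 and at most 81, and the target is G77; otherwise direct convolution is chosen. A small standalone restatement of that predicate, with hypothetical inputs:

    #include <iostream>

    enum class ConvolutionMethod { DIRECT, INDIRECT };

    // Restates the selection shown in the hunk above; kernel_sz is kernel_w * kernel_h,
    // is_g77 flags a G77 GPU target.
    ConvolutionMethod preferred_method(int kernel_sz, bool is_g77)
    {
        const bool is_kernel_sz_odd = kernel_sz % 2;
        return (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT
                                                                                  : ConvolutionMethod::DIRECT;
    }

    int main()
    {
        std::cout << (preferred_method(9, true)  == ConvolutionMethod::INDIRECT) << "\n"; // 3x3 kernel on G77 -> indirect
        std::cout << (preferred_method(9, false) == ConvolutionMethod::INDIRECT) << "\n"; // other targets -> direct
        return 0;
    }
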
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 8db6dabe58..7e331a86f3 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -38,7 +38,6 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -222,7 +221,6 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_native_kernel->set_target(gpu_target);
@@ -254,7 +252,6 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor
kernel_info.reinterpret_input_as_3d = false;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_reshape_lhs_kernel->set_target(gpu_target);
@@ -299,7 +296,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_reshaped_only_rhs_kernel->set_target(gpu_target);
@@ -346,7 +342,6 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
@@ -396,7 +391,6 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
@@ -433,7 +427,6 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con
kernel_info.reinterpret_input_as_3d = false;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -482,7 +475,6 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -531,7 +523,6 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -624,7 +615,12 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
// Select GEMMType
CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
{
- CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
+ CLScheduler::get().target(),
+ a->data_type(),
+ m,
+ n,
+ k,
+ batch_size,
},
gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
index 682477e4ea..5620471ff9 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.cpp
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,14 +54,14 @@ namespace opencl
{
ClGemmConv2d::ClGemmConv2d()
: _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
- _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _use_post_ops(false), _aux_mem(AuxTensorIdx::Count)
+ _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
{
}
ClGemmConv2d::~ClGemmConv2d() = default;
void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ int gemm_3d_depth, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
@@ -76,14 +76,12 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
false, // fast_math
false, // fp_mixed_precision
true, // broadcast_bias
- act_info, // activation_info
- post_ops // post ops
+ act_info // activation_info
);
TensorInfo tmp_src{ *src };
if(_is_quantized)
{
- ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops");
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo input_quantization_info = src->quantization_info();
@@ -118,7 +116,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
}
Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
{
const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
@@ -132,13 +130,11 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig
false, // fast_math
false, // fp_mixed_precision
true, // broadcast_bias
- act_info, // activation_info
- post_ops // post ops
+ act_info // activation_info
);
if(is_quantized)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops");
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo input_quantization_info = src->quantization_info();
@@ -189,19 +185,18 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
// Only for quantize there are few cases where we cannot fuse the activation function in GEMM
_fuse_activation = true;
- _use_post_ops = conv2d_info.post_ops.size() > 0;
const ITensorInfo *gemm_input_to_use = src;
ITensorInfo *gemm_output_to_use = dst;
// Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
// Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
src->dimension(idx_height),
kernel_width,
@@ -318,11 +313,10 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info, conv2d_info.post_ops);
+ configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
if(!_skip_col2im)
{
- ARM_COMPUTE_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClGemmConv2d does not support post ops with col2im operation"); // Post ops must be performed after every other op
// Set the GPU target for col2im
_col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
_col2im_kernel->set_target(CLScheduler::get().target());
@@ -334,8 +328,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
- // Disable running of activation kernel if post ops are used
- if(!_fuse_activation && !_use_post_ops)
+ if(!_fuse_activation)
{
_activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
_activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
@@ -383,15 +376,11 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
&& conv2d_info.conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool fuse_activation = true;
- bool use_post_ops = conv2d_info.post_ops.size() > 0;
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!skip_im2col
- && conv2d_info.post_ops.size() > 0,
- "ClGemmConv2d does not support post ops with col2im or im2col operation"); // Post ops must be performed after every other op
// Validate biases
if(biases != nullptr)
@@ -520,8 +509,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info,
- conv2d_info.post_ops));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
// Validate Col2Im
if(!skip_col2im)
@@ -530,8 +518,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
}
// Validate Activation Layer
- // Disable running (thus validation) of activation kernel if post ops are used
- if(!fuse_activation && !use_post_ops)
+ if(!fuse_activation)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
}
@@ -600,8 +587,7 @@ void ClGemmConv2d::run(ITensorPack &tensors)
}
//Run Activation Layer if we cannot fuse in GEMM
- // Disable running of activation kernel if post ops are used
- if(!_fuse_activation && !_use_post_ops)
+ if(!_fuse_activation)
{
ITensorPack pack =
{
@@ -620,7 +606,7 @@ void ClGemmConv2d::prepare(ITensorPack &tensors)
ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
+ ITensorPack pack =
{
{ TensorType::ACL_SRC, weights },
{ TensorType::ACL_DST, weights_reshaped.get() }
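
With the post-op guard removed from ClGemmConv2d, the fallback becomes a single boolean decision: if the activation could not be fused into the GEMM (which the source notes only happens for a few quantized cases), a standalone activation kernel runs on the GEMM output. A minimal sketch of that control flow, with hypothetical stand-ins for the real operators:

    #include <iostream>

    // Hypothetical stand-ins; the real operator dispatches ClGemm / ClActivationKernel.
    void run_gemm()       { std::cout << "GEMM (activation fused if supported)\n"; }
    void run_activation() { std::cout << "standalone activation kernel\n"; }

    void run_conv(bool fuse_activation)
    {
        run_gemm();
        if(!fuse_activation) // previously: if(!_fuse_activation && !_use_post_ops)
        {
            run_activation();
        }
    }

    int main()
    {
        run_conv(true);  // common case: activation fused into GEMM
        run_conv(false); // quantized corner cases: activation runs as a separate kernel
        return 0;
    }
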
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
index afde7c511d..8a46ee2dc3 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.h
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H
-#define ARM_COMPUTE_CL_GEMM_CONV2D_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -113,8 +112,8 @@ public:
const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -133,7 +132,7 @@ private:
*/
void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+ int gemm_3d_depth, const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -150,7 +149,7 @@ private:
* @return a status
*/
static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+ int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
enum AuxTensorIdx
{
@@ -178,10 +177,9 @@ private:
bool _fuse_activation;
bool _append_bias;
bool _is_prepared;
- bool _use_post_ops;
experimental::MemoryRequirements _aux_mem;
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp
index 85d24b4654..073ffd413d 100644
--- a/src/graph/DataLayerVisitor.cpp
+++ b/src/graph/DataLayerVisitor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -131,14 +131,6 @@ void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
add_convolution_layer_method<FusedConvolutionBatchNormalizationNode>(_layer_data, n);
}
-void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
- _layer_data.clear();
- add_generic_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
- add_convolution_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
- add_convolution_layer_method<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
-}
-
void DataLayerVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
_layer_data.clear();
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index e5b4adda26..70fe44e134 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018,2021 Arm Limited.
+ * Copyright (c) 2018,2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,6 @@ namespace graph
INode::INode()
: _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED}),
_outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED)
- ,_post_op_info_list(std::list<std::unique_ptr<ConvPostOpInfo>> {})
{
}
// clang-format on
@@ -200,15 +199,5 @@ Target INode::assigned_target() const
{
return _assigned_target;
}
-
-const std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list() const
-{
- return _post_op_info_list;
-}
-
-std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list()
-{
- return _post_op_info_list;
-}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp
index f067d618bd..5369f6f539 100644
--- a/src/graph/INodeVisitor.cpp
+++ b/src/graph/INodeVisitor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,14 +85,6 @@ void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
{
default_visit(n);
}
-void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
- default_visit(n);
-}
-void DefaultNodeVisitor::visit(FusedConvolutionWithPostOpNode &n)
-{
- default_visit(n);
-}
void DefaultNodeVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
default_visit(n);
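The overloads deleted here belong to the standard visitor pattern used by the graph layer: DefaultNodeVisitor forwards any node kind it does not specialise to default_visit(). A minimal, library-independent sketch of that dispatch shape follows; ConvNode, ActNode and PrintingVisitor are illustrative names, not ACL classes.

#include <iostream>

struct ConvNode;
struct ActNode;

struct Visitor
{
    virtual ~Visitor() = default;
    // Unhandled node kinds fall through to default_visit(), mirroring DefaultNodeVisitor.
    virtual void visit(ConvNode &) { default_visit(); }
    virtual void visit(ActNode &) { default_visit(); }
    virtual void default_visit() { std::cout << "unhandled node\n"; }
};

struct ConvNode { void accept(Visitor &v) { v.visit(*this); } };
struct ActNode  { void accept(Visitor &v) { v.visit(*this); } };

int main()
{
    struct PrintingVisitor : Visitor
    {
        void visit(ConvNode &) override { std::cout << "conv\n"; }
        // ActNode intentionally not overridden: it uses default_visit().
    };

    ConvNode conv;
    ActNode  act;
    PrintingVisitor v;
    conv.accept(v); // prints "conv"
    act.accept(v);  // prints "unhandled node"
    return 0;
}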
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index c67f6a538b..882810474e 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -274,8 +274,6 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &
return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
case NodeType::FusedConvolutionBatchNormalizationLayer:
return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
- case NodeType::FusedConvolutionWithPostOp:
- return detail::create_fused_convolution_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node), ctx);
case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
case NodeType::GenerateProposalsLayer:
@@ -318,8 +316,6 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &
return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
case NodeType::StridedSliceLayer:
return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
- case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
- return detail::create_fused_convolution_batch_normalization_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationWithPostOpsNode *>(node), ctx);
default:
return nullptr;
}
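For context, CLFunctionFactory::create() is a switch over NodeType that either returns a newly created function or nullptr for unsupported node kinds, which is why the two post-op cases could simply be dropped. Below is a reduced, self-contained sketch of that dispatch style; the types and node kinds are made up for illustration and are not ACL API.

#include <memory>

struct IFunction { virtual ~IFunction() = default; virtual void run() = 0; };
struct ConvFunction : IFunction { void run() override {} };
struct PoolFunction : IFunction { void run() override {} };

enum class NodeType { ConvolutionLayer, PoolingLayer, Unknown };

// Switch-based creation; unknown node types yield nullptr, as in the default case above.
std::unique_ptr<IFunction> create(NodeType type)
{
    switch (type)
    {
        case NodeType::ConvolutionLayer:
            return std::make_unique<ConvFunction>();
        case NodeType::PoolingLayer:
            return std::make_unique<PoolFunction>();
        default:
            return nullptr;
    }
}

int main()
{
    auto f = create(NodeType::ConvolutionLayer);
    if (f)
    {
        f->run();
    }
    return f ? 0 : 1;
}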
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index c50782db48..8fd8c14f63 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,8 +76,6 @@ Status CLNodeValidator::validate(INode *node)
CLDirectConvolutionLayer,
CLGEMMConvolutionLayer,
CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
- case NodeType::FusedConvolutionWithPostOp:
- return detail::validate_fused_convolution_with_post_op<CLGEMMConvolutionLayer>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node));
case NodeType::DepthToSpaceLayer:
return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 8eb3e4cb71..38284b93cf 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -29,8 +29,6 @@
#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
#include "arm_compute/graph/nodes/Nodes.h"
#include "src/graph/mutators/MutatorUtils.h"
@@ -333,441 +331,6 @@ void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse
}
}
-/** Check valid combinations:
- *
- * | Main operator | Post operators |
- * |:--------------|:---------------------------|
- * |conv | add |
- * |conv | act + add |
- * |conv | add + act |
- * |conv | act + add + act |
- *
-*/
-#define MAX_VALIDE_COMBINATION 4
-#define MAX_POST_OP_NUM 3
-NodeType valide_post_op_type[MAX_VALIDE_COMBINATION][MAX_POST_OP_NUM] = { { EltwiseLayerNode::node_type },
- { EltwiseLayerNode::node_type, ActivationLayerNode::node_type },
- { ActivationLayerNode::node_type, EltwiseLayerNode::node_type },
- { ActivationLayerNode::node_type, EltwiseLayerNode::node_type, ActivationLayerNode::node_type }
-};
-
-bool check_post_op_type(NodeType *post_op_type, int len)
-{
- if(len > MAX_POST_OP_NUM || len <= 0)
- {
- return false;
- }
-
- bool found = false;
- for(int i = 0; i < MAX_VALIDE_COMBINATION; ++i)
- {
- for(int j = 0; j < len; ++j)
- {
- if(post_op_type[j] != valide_post_op_type[i][j])
- {
- found = false;
- break;
- }
- found = true;
- }
- if(found)
- break;
- }
-
- return found;
-}
-
-void fuse_convolution_with_post_op(Graph &g, INode *fused_node, std::list<INode *> post_op_node_list, int prev_op_dst_pos)
-{
- unsigned int op_idx = 0;
- // Fuse post operators with conv
- for(const auto &post_op : post_op_node_list)
- {
- switch(post_op->type())
- {
- case EltwiseLayerNode::node_type:
- {
- auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op);
- ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
-
- fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoEltwiseAdd>(prev_op_dst_pos, eltwise_node->convert_policy()));
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Elementwise Layer node with ID : " << post_op->id());
- break;
- }
- case ActivationLayerNode::node_type:
- {
- auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op);
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
-
- fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoActivation>(act_node->activation_info()));
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Activation Layer node with ID : " << post_op->id());
- break;
- }
- default:
- {
- break;
- }
- }
-
- if(op_idx == post_op_node_list.size() - 1) // last fusable node
- {
- transfer_driving_nodes_and_remove_old_node(g, fused_node, post_op, true);
- }
- else
- {
- // Remove node
- g.remove_node(post_op->id());
- }
- op_idx++;
- }
-}
-
-std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
- std::list<INode *> post_op_node_list = {};
- NodeID prev_op_dst_id = conv_node_id;
- NodeType post_op_type_list[3] = { NodeType::Dummy, NodeType::Dummy, NodeType::Dummy };
- int post_op_idx = 0;
-
- // Get list of the connected nodes
- auto current_node = g.node(conv_node_id);
-
- while(post_op_node_list.size() < 3)
- {
- // This convolution node must have only one output edge, otherwise this function would not have been called
-
- auto current_output_edge_id = current_node->output_edges().begin();
- auto current_output_edge = g.edge(*current_output_edge_id);
- auto post_op_node = current_output_edge->consumer();
-
- bool fusable_post_op = false;
- if(post_op_node != nullptr && post_op_node->output_edges().size() > 0)
- {
- switch(post_op_node->type())
- {
- case EltwiseLayerNode::node_type:
- {
- auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op_node);
- ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
- if(eltwise_node->output(0)->accessor() == nullptr)
- {
- post_op_node_list.push_back(post_op_node);
- fusable_post_op = true;
- post_op_type_list[post_op_idx++] = eltwise_node->type();
-
- // Extract elementwise inputs
- const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id();
- const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id();
- if(eltwise_input_id_0 == prev_op_dst_id)
- {
- eltwise_operand_id = eltwise_input_id_1;
- prev_op_dst_pos = 0;
- }
- else if(eltwise_input_id_1 == prev_op_dst_id)
- {
- eltwise_operand_id = eltwise_input_id_0;
- prev_op_dst_pos = 1;
- }
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n");
- }
- break;
- }
- case ActivationLayerNode::node_type:
- {
- auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op_node);
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
- // Check if activation is supported for fusion
- if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
- {
- break;
- }
- if(act_node->output(0)->accessor() == nullptr)
- {
- post_op_node_list.push_back(post_op_node);
- fusable_post_op = true;
- post_op_type_list[post_op_idx++] = act_node->type();
- prev_op_dst_id = act_node->id();
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
- }
- break;
- }
- default:
- {
- break;
- }
- }
-
- // Check if the node is not a branching node and current node is fusable
- if(post_op_node->output_edges().size() == 1 && fusable_post_op == true)
- {
- current_node = post_op_node;
- }
- else
- {
- break;
- }
- }
- }
-
- // Check whether it's valid post op list
- if(post_op_node_list.size() > 0)
- {
- bool fuse_with_post_op = check_post_op_type(post_op_type_list, post_op_node_list.size());
- if(!fuse_with_post_op)
- {
- post_op_node_list.clear();
- }
- }
-
- return post_op_node_list;
-}
-
-/** Fuse below operators:
- *
- * | Main operator | Post operators |
- * |:--------------|:---------------------------|
- * |conv | add |
- * |conv | act + add |
- * |conv | add + act |
- * |conv | act + add + act |
- *
- * Notes: currently, only GEMM supports fusion with post operator
-*/
-void fuse_convolution_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
- ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
-
- auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
- ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
-
- const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
- if(conv_algorithm != ConvolutionMethod::GEMM)
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
-
- // Prevent fusion if fused node has an output accessor
- if(conv_node->output(0)->accessor() == nullptr)
- {
- // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
- const Edge *input_edge = conv_node->input_edge(1);
- if(input_edge != nullptr && input_edge->tensor() != nullptr)
- {
- const DataLayout data_layout = input_edge->tensor()->desc().layout;
- const DataType data_type = input_edge->tensor()->desc().data_type;
- const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
- if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
- }
- else
- {
- return;
- }
-
- // Get post op list
- int eltwise_operand_id = 0;
- int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator
- std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
-
- if(post_op_node_list.size() == 0)
- {
- return;
- }
- else // Do convolution fusion with post op if there're one(elementwise), two or more operators
- {
- const Target assigned_target = conv_node->assigned_target();
-
- // Extract conv inputs
- const auto conv_input_id = conv_node->input_edge(0)->producer_id();
- const auto conv_weights_id = conv_node->input_edge(1)->producer_id();
- const auto conv_info = conv_node->convolution_info();
- const auto conv_method = conv_node->convolution_method();
- const auto num_groups = conv_node->num_groups();
- FastMathHint fast_math_hint = conv_node->fast_math_hint();
-
- // Create the fused node
- const NodeID fused_id = g.add_node<FusedConvolutionWithPostOpNode>(conv_info, num_groups, conv_method, fast_math_hint);
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << conv_node->id());
-
- // Add connections from the conv inputs to the fused node
- g.add_connection(conv_input_id, 0, fused_id, 0);
- g.add_connection(conv_weights_id, 0, fused_id, 1);
- if(conv_node->input_edge(2) != nullptr)
- {
- auto conv_bias_id = conv_node->input_edge(2)->producer_id();
- g.add_connection(conv_bias_id, 0, fused_id, 2);
- }
- // Adding the Element wise operand in case the post op is element wise operation
- auto it = std::find_if(post_op_node_list.begin(),
- post_op_node_list.end(),
- [&](const INode * nd)
- {
- return (nd->type() == graph::NodeType::EltwiseLayer);
- });
-
- if(it != post_op_node_list.end())
- {
- g.add_connection(eltwise_operand_id, 0, fused_id, 3);
- }
- g.remove_node(conv_node->id());
-
- // Update fused node outputs
- auto fused_node = g.node(fused_id);
- fused_node->set_assigned_target(assigned_target);
-
- // Fuse convolution with post op
- fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
- post_op_node_list.clear();
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
- }
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
- }
-}
-
-void fuse_convolution_batch_normalization_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
- ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
-
- auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(output_edge->producer());
- ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
- const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
- if(conv_algorithm != ConvolutionMethod::GEMM)
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
-
- // Prevent fusion if fused node has an output accessor
- if(conv_node->output(0)->accessor() == nullptr)
- {
- // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
- const Edge *input_edge = conv_node->input_edge(1);
- if(input_edge != nullptr && input_edge->tensor() != nullptr)
- {
- const DataLayout data_layout = input_edge->tensor()->desc().layout;
- const DataType data_type = input_edge->tensor()->desc().data_type;
- const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
- if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
- }
- else
- {
- return;
- }
-
- // Get post op list
- int eltwise_operand_id = 0;
- int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator
- std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
-
- if(post_op_node_list.size() == 0)
- {
- return;
- }
- else // Do convolution fusion with post op if there're one(elementwise), two or more operators
- {
- const Target assigned_target = conv_node->assigned_target();
-
- // Extract conv inputs
- const auto conv_input_id = conv_node->input_edge(0)->producer_id();
- const auto conv_weights_id = conv_node->input_edge(1)->producer_id();
- const auto bn_mean_id = conv_node->input_edge(3)->producer_id();
- const auto bn_var_id = conv_node->input_edge(4)->producer_id();
- const auto conv_info = conv_node->convolution_info();
- const auto conv_method = conv_node->convolution_method();
- const auto num_groups = conv_node->num_groups();
- FastMathHint fast_math_hint = conv_node->fast_math_hint();
-
- // Create the fused node
-
- const float epsilon = conv_node->epsilon();
- const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationWithPostOpsNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint);
-
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing FusedConvolutionBatchNormalization node with ID : " << conv_node->id());
-
- // Add connections from the conv inputs to the fused node
- g.add_connection(conv_input_id, 0, fused_id, 0);
- g.add_connection(conv_weights_id, 0, fused_id, 1);
-
- if(conv_node->input_edge(2) != nullptr)
- {
- auto conv_bias_id = conv_node->input_edge(2)->producer_id();
- g.add_connection(conv_bias_id, 0, fused_id, 2);
- }
- g.add_connection(bn_mean_id, 0, fused_id, 3);
- g.add_connection(bn_var_id, 0, fused_id, 4);
-
- // Move connections of old FusedConvolutionBatchNormalization to the fused node
- if(conv_node->input_edge(5) != nullptr)
- {
- const auto bn_beta_id = conv_node->input_edge(5)->producer_id();
- g.add_connection(bn_beta_id, 0, fused_id, 5);
- }
-
- if(conv_node->input_edge(6) != nullptr)
- {
- const auto bn_gamma_id = conv_node->input_edge(6)->producer_id();
- g.add_connection(bn_gamma_id, 0, fused_id, 6);
- }
-
- // Adding the Element wise operand in case the post op is element wise operation
- auto it = std::find_if(post_op_node_list.begin(),
- post_op_node_list.end(),
- [&](const INode * nd)
- {
- return (nd->type() == graph::NodeType::EltwiseLayer);
- });
-
- if(it != post_op_node_list.end())
- {
- g.add_connection(eltwise_operand_id, 0, fused_id, 7);
- }
-
- // Update fused node outputs
- auto fused_node = g.node(fused_id);
- fused_node->set_assigned_target(assigned_target);
-
- auto conv_node_name = conv_node->name();
-
- // collect the post ops names
- std::string post_ops_name = "";
- for(auto &post_op : post_op_node_list)
- {
- post_ops_name += post_op->name();
- }
- fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + post_ops_name, assigned_target });
-
- // Fuse convolution with post op
- fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
- post_op_node_list.clear();
- g.remove_node(conv_node->id());
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
- }
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
- }
-}
-
template <typename N1, typename F, typename... Args>
void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
{
@@ -839,10 +402,6 @@ void NodeFusionMutator::mutate(Graph &g)
detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
- // The fusion of PostOps to ConvolutionLayer:
- // It must occur after the fusion of PadLayer into ConvolutionLayer
- // It must occur before the fusion of normal ActivationLayer into ConvolutionLayer as it takes precedence
- detail::fuse_layer<ConvolutionLayerNode>(g, cl_target_prec, detail::fuse_convolution_with_post_ops, supported_fused_activations);
detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
@@ -851,7 +410,6 @@ void NodeFusionMutator::mutate(Graph &g)
// The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any
detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
- detail::fuse_layer<FusedConvolutionBatchNormalizationNode>(g, cl_target_prec, detail::fuse_convolution_batch_normalization_with_post_ops, supported_fused_activations);
}
} // namespace graph
} // namespace arm_compute
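The deleted mutator logic accepted only four post-operator sequences after a GEMM convolution (add, add+act, act+add, act+add+act) and checked the chain of consumer node types against that table before fusing. The sketch below captures the same idea in a self-contained form; it matches complete sequences rather than reproducing the original element-by-element prefix comparison, and NodeKind / is_valid_post_op_chain are illustrative names, not ACL API.

#include <algorithm>
#include <vector>

enum class NodeKind { Eltwise, Activation };

// The four accepted sequences from the deleted table (Eltwise == add here):
// add | add + act | act + add | act + add + act
static const std::vector<std::vector<NodeKind>> kValidChains = {
    { NodeKind::Eltwise },
    { NodeKind::Eltwise, NodeKind::Activation },
    { NodeKind::Activation, NodeKind::Eltwise },
    { NodeKind::Activation, NodeKind::Eltwise, NodeKind::Activation },
};

bool is_valid_post_op_chain(const std::vector<NodeKind> &chain)
{
    if (chain.empty() || chain.size() > 3)
    {
        return false;
    }
    for (const auto &valid : kValidChains)
    {
        if (chain.size() == valid.size() && std::equal(chain.begin(), chain.end(), valid.begin()))
        {
            return true;
        }
    }
    return false;
}

int main()
{
    const bool ok  = is_valid_post_op_chain({ NodeKind::Activation, NodeKind::Eltwise });   // act + add
    const bool bad = is_valid_post_op_chain({ NodeKind::Activation, NodeKind::Activation }); // not in the table
    return (ok && !bad) ? 0 : 1;
}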
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
deleted file mode 100644
index af81f0369a..0000000000
--- a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/graph/Utils.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-FusedConvolutionBatchNormalizationWithPostOpsNode::FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
- unsigned int num_groups,
- ConvolutionMethod method,
- FastMathHint fast_math_hint)
- : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint)
-{
- _input_edges.resize(8, EmptyEdgeID);
- _outputs.resize(1, NullTensorID);
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::set_convolution_method(ConvolutionMethod method)
-{
- _method = method;
-}
-
-float FusedConvolutionBatchNormalizationWithPostOpsNode::epsilon() const
-{
- return _epsilon;
-}
-
-ConvolutionMethod FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_method() const
-{
- return _method;
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::set_fast_math_hint(FastMathHint hint)
-{
- _fast_math_hint = hint;
-}
-
-FastMathHint FusedConvolutionBatchNormalizationWithPostOpsNode::fast_math_hint() const
-{
- return _fast_math_hint;
-}
-
-PadStrideInfo FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_info() const
-{
- return _info;
-}
-
-unsigned int FusedConvolutionBatchNormalizationWithPostOpsNode::num_groups() const
-{
- return _num_groups;
-}
-
-TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
- const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
-
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
-
- const DataLayout data_layout = input_descriptor.layout;
- TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
-
- return output_descriptor;
-}
-
-bool FusedConvolutionBatchNormalizationWithPostOpsNode::forward_descriptors()
-{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
- {
- Tensor *dst = output(0);
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
- dst->desc() = configure_output(0);
- return true;
- }
- return false;
-}
-
-TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::configure_output(size_t idx) const
-{
- ARM_COMPUTE_UNUSED(idx);
- const Tensor *src = input(0);
- const Tensor *weights = input(1);
-
- ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
-
- TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
-
- return output_info;
-}
-
-NodeType FusedConvolutionBatchNormalizationWithPostOpsNode::type() const
-{
- return FusedConvolutionBatchNormalizationWithPostOpsNode::node_type;
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::accept(INodeVisitor &v)
-{
- v.visit(*this);
-}
-} // namespace graph
-} // namespace arm_compute
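compute_output_descriptor() in the deleted node derived the output width and height from the input and kernel sizes via scaled_dimensions(). The standalone sketch below shows the underlying convolution output-size arithmetic for the simple case of symmetric padding and floor rounding (ACL's scaled_dimensions() additionally handles asymmetric padding and other rounding modes); conv_output_dims is an illustrative helper, not library API.

#include <cstdio>
#include <utility>

// Standard conv output-size formula with floor rounding:
// out = (in + 2*pad - kernel) / stride + 1
std::pair<unsigned, unsigned> conv_output_dims(unsigned in_w, unsigned in_h,
                                               unsigned k_w, unsigned k_h,
                                               unsigned stride_x, unsigned stride_y,
                                               unsigned pad_x, unsigned pad_y)
{
    const unsigned out_w = (in_w + 2 * pad_x - k_w) / stride_x + 1;
    const unsigned out_h = (in_h + 2 * pad_y - k_h) / stride_y + 1;
    return { out_w, out_h };
}

int main()
{
    // e.g. a 224x224 input with a 1x1 kernel, stride 1, no padding -> 224x224,
    // the Conv1x1 case the removed fusion path targeted because it maps to GEMM.
    const auto dims = conv_output_dims(224, 224, 1, 1, 1, 1, 0, 0);
    std::printf("%u x %u\n", dims.first, dims.second);
    return 0;
}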
diff --git a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp b/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp
deleted file mode 100644
index 63341e2760..0000000000
--- a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/graph/Utils.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-FusedConvolutionWithPostOpNode::FusedConvolutionWithPostOpNode(PadStrideInfo info,
- unsigned int num_groups,
- ConvolutionMethod method,
- FastMathHint fast_math_hint,
- QuantizationInfo out_quant_info)
- : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation()
-{
- _input_edges.resize(4, EmptyEdgeID);
- _outputs.resize(1, NullTensorID);
-}
-
-void FusedConvolutionWithPostOpNode::set_convolution_method(ConvolutionMethod method)
-{
- _method = method;
-}
-
-ConvolutionMethod FusedConvolutionWithPostOpNode::convolution_method() const
-{
- return _method;
-}
-
-void FusedConvolutionWithPostOpNode::set_fast_math_hint(FastMathHint hint)
-{
- _fast_math_hint = hint;
-}
-
-FastMathHint FusedConvolutionWithPostOpNode::fast_math_hint() const
-{
- return _fast_math_hint;
-}
-
-PadStrideInfo FusedConvolutionWithPostOpNode::convolution_info() const
-{
- return _info;
-}
-
-unsigned int FusedConvolutionWithPostOpNode::num_groups() const
-{
- return _num_groups;
-}
-
-ActivationLayerInfo FusedConvolutionWithPostOpNode::fused_activation() const
-{
- return _fused_activation;
-}
-
-void FusedConvolutionWithPostOpNode::set_fused_activation(ActivationLayerInfo fused_activation)
-{
- _fused_activation = fused_activation;
-}
-
-void FusedConvolutionWithPostOpNode::set_convolution_info(PadStrideInfo info)
-{
- _info = info;
-}
-
-TensorDescriptor FusedConvolutionWithPostOpNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
- const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
-
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
-
- const DataLayout data_layout = input_descriptor.layout;
- TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
-
- return output_descriptor;
-}
-
-bool FusedConvolutionWithPostOpNode::forward_descriptors()
-{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
- {
- Tensor *dst = output(0);
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
- dst->desc() = configure_output(0);
- return true;
- }
- return false;
-}
-
-TensorDescriptor FusedConvolutionWithPostOpNode::configure_output(size_t idx) const
-{
- ARM_COMPUTE_UNUSED(idx);
- const Tensor *src = input(0);
- const Tensor *weights = input(1);
-
- ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
-
- TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
- if(!_out_quant_info.empty())
- {
- output_info.quant_info = _out_quant_info;
- }
-
- return output_info;
-}
-
-NodeType FusedConvolutionWithPostOpNode::type() const
-{
- return FusedConvolutionWithPostOpNode::node_type;
-}
-
-void FusedConvolutionWithPostOpNode::accept(INodeVisitor &v)
-{
- v.visit(*this);
-}
-} // namespace graph
-} // namespace arm_compute
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 1071d50197..9c7c4248bb 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,22 +85,6 @@ void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
_info = ss.str();
}
-void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
- ARM_COMPUTE_UNUSED(n);
- std::stringstream ss;
- ss << "FusedConvolutionBatchNormalizationWithPostOpsNode";
- _info = ss.str();
-}
-
-void DotGraphVisitor::visit(FusedConvolutionWithPostOpNode &n)
-{
- ARM_COMPUTE_UNUSED(n);
- std::stringstream ss;
- ss << "FusedConvolutionWithPostOpNode";
- _info = ss.str();
-}
-
void DotGraphVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
ARM_COMPUTE_UNUSED(n);
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 476bf27423..f3c05adb47 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -29,7 +29,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClConv2d.h"
@@ -61,26 +60,21 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_ma
CLConvolutionLayer::~CLConvolutionLayer() = default;
void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
}
void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
- // Convert post op arguments to ITensorInfo
- auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor)
- {
- return tensor->info();
- });
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, transformed_post_ops);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
weights_info, CLScheduler::get().target()))
@@ -97,7 +91,6 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
}
case ConvolutionMethod::FFT:
{
- ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops");
auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager);
f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
_impl->func = std::move(f);
@@ -110,31 +103,23 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
if(_impl->op)
{
- _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
- _impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- size_t post_op_tensor_index = 0;
- for(const auto &op : post_ops.get_list())
- {
- for(auto &tensor : op->arguments())
- {
- _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor);
- }
- }
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
+ _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
+ _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
const GPUTarget gpu_target = CLScheduler::get().target();
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
{
@@ -149,7 +134,6 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
case ConvolutionMethod::FFT:
{
// Validate FFT-based convolution layer
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops");
ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
break;
}
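With the PostOpList argument removed, calling the public CLConvolutionLayer reduces to the plain signature kept above (sources, destination, conv/weights info, dilation, activation, optional fast-math flag and group count). A rough usage sketch under that assumption follows; the shapes are arbitrary examples and the tensors are left uninitialised, so treat it as an outline rather than a complete sample.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Example shapes (W, H, C[, N]) in ACL's default NCHW ordering.
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    CLConvolutionLayer conv;
    // A fused activation is passed directly; there is no PostOpList parameter any more.
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 1, 1), WeightsInfo(), Size2D(1U, 1U),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run();
    CLScheduler::get().sync();
    return 0;
}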
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ad5bfd8dd2..c8c18f35db 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "support/Cast.h"
@@ -69,24 +68,19 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m
CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups, post_ops);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
}
void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- _impl->weights = weights;
- _impl->op = std::make_unique<opencl::ClGemmConv2d>();
- // Convert post op arguments to ITensorInfo
- auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor)
- {
- return tensor->info();
- });
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, transformed_post_ops);
+ _impl->weights = weights;
+ _impl->op = std::make_unique<opencl::ClGemmConv2d>();
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
_impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
_impl->run_pack =
@@ -96,15 +90,6 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
{ TensorType::ACL_SRC_2, biases },
{ TensorType::ACL_DST, output }
};
- // Add post op tensors
- size_t post_op_tensor_index = 0;
- for(const auto &op : post_ops.get_list())
- {
- for(auto &tensor : op->arguments())
- {
- _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor);
- }
- }
_impl->prep_pack =
{
{ TensorType::ACL_SRC_1, weights },
@@ -115,9 +100,9 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
}
Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, post_ops);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
}
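The validate() path is simplified in the same way. As a rough sketch of how a caller might pre-check a configuration against the new signature (shapes are arbitrary, ACL's default NCHW shape ordering is assumed, and a CL-capable device is required for the scheduler initialisation):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Example shapes: a 1x1 convolution, the case the old post-op fusion targeted.
    const TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(1U, 1U, 16U, 8U), 1, DataType::F32);
    const TensorInfo biases(TensorShape(8U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(32U, 32U, 8U), 1, DataType::F32);

    const Status status = CLGEMMConvolutionLayer::validate(&src, &weights, &biases, &dst,
                                                           PadStrideInfo(1, 1, 0, 0), WeightsInfo(), Size2D(1U, 1U),
                                                           ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                                                           1 /* num_groups */);
    return bool(status) ? 0 : 1;
}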