aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Sujak <jakub.sujak@arm.com>2023-08-24 14:01:20 +0100
committerJakub Sujak <jakub.sujak@arm.com>2023-09-04 14:41:16 +0000
commit0d27b2ee8d811d66693555ac1e7be44d93e662e2 (patch)
tree8b62a464a8bb9cd46702c8b5a60f3a97e3821b41
parent7ff03b67ba7ce669223f4d807e18fa3efa2f729b (diff)
downloadComputeLibrary-0d27b2ee8d811d66693555ac1e7be44d93e662e2.tar.gz
Remove legacy PostOps code
PostOps was the experimental interface for Dynamic Fusion. It is now replaced by the new Dynamic Fusion interface with code generation using the Compute Kernel Writer. Resolves: COMPMID-6190 Change-Id: I813b48facef2fd6f3aee332588886b4f9b3d33d8 Signed-off-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10219 Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp6
-rw-r--r--SConscript8
-rw-r--r--arm_compute/core/KernelDescriptors.h74
-rw-r--r--arm_compute/core/Types.h51
-rw-r--r--arm_compute/core/experimental/IPostOp.h180
-rw-r--r--arm_compute/core/experimental/PostOps.h163
-rw-r--r--arm_compute/core/experimental/Types.h13
-rw-r--r--arm_compute/function_info/GEMMInfo.h60
-rw-r--r--arm_compute/graph/DataLayerVisitor.h9
-rw-r--r--arm_compute/graph/INode.h33
-rw-r--r--arm_compute/graph/INodeVisitor.h20
-rw-r--r--arm_compute/graph/TypePrinter.h14
-rw-r--r--arm_compute/graph/Types.h74
-rw-r--r--arm_compute/graph/backends/FunctionHelpers.h188
-rw-r--r--arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h136
-rw-r--r--arm_compute/graph/backends/ValidateHelpers.h44
-rw-r--r--arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h127
-rw-r--r--arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h132
-rw-r--r--arm_compute/graph/nodes/Nodes.h10
-rw-r--r--arm_compute/graph/nodes/NodesFwd.h10
-rw-r--r--arm_compute/graph/printers/DotGraphPrinter.h10
-rw-r--r--arm_compute/runtime/CL/functions/CLConvolutionLayer.h19
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h19
-rw-r--r--arm_compute/runtime/FunctionDescriptors.h34
-rw-r--r--docs/user_guide/release_version_and_change_log.dox1
-rwxr-xr-xscripts/format_code.py4
-rw-r--r--src/BUILD.bazel2
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/core/CL/CLUtils.cpp114
-rw-r--r--src/core/CL/CLUtils.h91
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h103
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl372
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl1424
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl1399
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h274
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h113
-rw-r--r--src/core/CL/cl_kernels/common/gemm.cl9
-rw-r--r--src/core/experimental/PostOpUtils.h97
-rw-r--r--src/cpu/operators/CpuGemmConv2d.cpp10
-rw-r--r--src/gpu/cl/ClKernelLibrary.cpp29
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp63
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h25
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp55
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h25
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp56
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h29
-rw-r--r--src/gpu/cl/operators/ClConv2d.cpp32
-rw-r--r--src/gpu/cl/operators/ClGemm.cpp16
-rw-r--r--src/gpu/cl/operators/ClGemmConv2d.cpp50
-rw-r--r--src/gpu/cl/operators/ClGemmConv2d.h18
-rw-r--r--src/graph/DataLayerVisitor.cpp10
-rw-r--r--src/graph/INode.cpp15
-rw-r--r--src/graph/INodeVisitor.cpp10
-rw-r--r--src/graph/backends/CL/CLFunctionsFactory.cpp6
-rw-r--r--src/graph/backends/CL/CLNodeValidator.cpp4
-rw-r--r--src/graph/mutators/NodeFusionMutator.cpp442
-rw-r--r--src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp138
-rw-r--r--src/graph/nodes/FusedConvolutionWithPostOpNode.cpp153
-rw-r--r--src/graph/printers/DotGraphPrinter.cpp18
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp40
-rw-r--r--src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp33
-rw-r--r--tests/CMakeLists.txt1
-rw-r--r--tests/validation/CL/ConvolutionLayer.cpp291
-rw-r--r--tests/validation/CL/GEMMMatrixMultiplyNative.cpp244
-rw-r--r--tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp421
-rw-r--r--tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp294
-rw-r--r--tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp10
-rw-r--r--tests/validation/fixtures/GEMMFixture.h740
-rw-r--r--tests/validation/reference/PostOps.cpp93
-rw-r--r--tests/validation/reference/PostOps.h47
-rw-r--r--utils/TypePrinter.h152
71 files changed, 360 insertions, 8649 deletions
diff --git a/Android.bp b/Android.bp
index 885786c5af..dabd70ca93 100644
--- a/Android.bp
+++ b/Android.bp
@@ -28,12 +28,6 @@ opencl_srcs = [
"src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl",
"src/core/CL/cl_kernels/common/elementwise_unary.cl",
"src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl",
- "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
- "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
- "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
- "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
- "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h",
- "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
"src/core/CL/cl_kernels/common/fft.cl",
"src/core/CL/cl_kernels/common/fft_digit_reverse.cl",
"src/core/CL/cl_kernels/common/fft_scale.cl",
diff --git a/SConscript b/SConscript
index 4eb3c25142..7bc5affde1 100644
--- a/SConscript
+++ b/SConscript
@@ -222,7 +222,7 @@ def resolve_includes(target, source, env):
found = pattern.search(line)
if found:
# Only get the header file name and discard the relative path.
- # E.g. "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h" -> "fp_mixed_precision_helpers.h"
+ # E.g. "src/core/CL/cl_kernels/activation_float_helpers.h" -> "activation_float_helpers.h"
include_file = found.group(1).split('/')[-1]
data = files_dict[include_file].file_contents
updated_file.extend(data)
@@ -387,9 +387,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/tile_helpers.h',
'src/core/CL/cl_kernels/types.h',
'src/core/CL/cl_kernels/warp_helpers.h',
- 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h',
- 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h',
- 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h',
]
# Common kernels
@@ -414,9 +411,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl',
'src/core/CL/cl_kernels/common/elementwise_unary.cl',
'src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl',
- 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
- 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
- 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
'src/core/CL/cl_kernels/common/fft_digit_reverse.cl',
'src/core/CL/cl_kernels/common/fft.cl',
'src/core/CL/cl_kernels/common/fft_scale.cl',
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index 305766e825..2bf5dee18c 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -21,12 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS
-#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS
+#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
namespace arm_compute
@@ -60,46 +59,43 @@ struct GEMMKernelInfo
{
GEMMKernelInfo() = default;
GEMMKernelInfo(
- unsigned int im,
- unsigned int in,
- unsigned int ik,
- unsigned int idepth_output_gemm3d,
- bool ireinterpret_input_as_3d,
- bool ibroadcast_bias,
- bool ifp_mixed_precision,
- bool ihas_pad_y,
- ActivationLayerInfo iactivation_info,
- int inmult_transpose1xW_width,
- int imult_interleave4x4_height,
- GEMMLHSMatrixInfo ilhs_info,
- GEMMRHSMatrixInfo irhs_info,
- int32_t ina_offset,
- int32_t inb_offset,
- const experimental::PostOpList<ITensorInfo *> &ipost_ops = experimental::PostOpList<ITensorInfo *> {})
+ unsigned int im,
+ unsigned int in,
+ unsigned int ik,
+ unsigned int idepth_output_gemm3d,
+ bool ireinterpret_input_as_3d,
+ bool ibroadcast_bias,
+ bool ifp_mixed_precision,
+ bool ihas_pad_y,
+ ActivationLayerInfo iactivation_info,
+ int inmult_transpose1xW_width,
+ int imult_interleave4x4_height,
+ GEMMLHSMatrixInfo ilhs_info,
+ GEMMRHSMatrixInfo irhs_info,
+ int32_t ina_offset,
+ int32_t inb_offset)
: m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info),
- rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops)
+ rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset)
{
}
- unsigned int m{ 0 }; /**< Number of LHS rows*/
- unsigned int n{ 0 }; /**< Number of RHS columns*/
- unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */
- unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */
- bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
- bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */
- bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
- bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
- ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
- int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */
- int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */
- GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
- GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
- int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */
- int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */
- GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
- experimental::PostOpList<ITensorInfo *> post_ops{}; /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note unsupported post ops would not be executed.
- * If specified, automatically disable the @ref activation_info */
+ unsigned int m{ 0 }; /**< Number of LHS rows*/
+ unsigned int n{ 0 }; /**< Number of RHS columns*/
+ unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */
+ unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */
+ bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
+ bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */
+ bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+ bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+ ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
+ int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */
+ int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */
+ GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+ GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
+ int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */
+ int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */
+ GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
};
/** Compute descriptor used by the depthwise convolution native kernel */
@@ -240,4 +236,4 @@ struct MatMulKernelInfo
bool export_rhs_to_cl_image{ false }; /**< Flag to know whether the RHS tensor should be exported to cl_image*/
};
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS */
+#endif // ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 12d860205e..9264cefe3e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_CORE_TYPES
-#define ACL_ARM_COMPUTE_CORE_TYPES
+#ifndef ACL_ARM_COMPUTE_CORE_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_TYPES_H
/** The following symbols have been moved to:
* half
@@ -65,7 +65,6 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Size3D.h"
#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/core/utils/misc/Macros.h"
#include "support/Bfloat16.h"
@@ -751,14 +750,14 @@ public:
}
private:
- std::vector<float> _min_sizes;
- std::vector<float> _variances;
- float _offset;
- bool _flip;
- bool _clip;
- std::vector<float> _max_sizes;
- std::vector<float> _aspect_ratios;
- Coordinates2D _img_size;
+ std::vector<float> _min_sizes;
+ std::vector<float> _variances;
+ float _offset;
+ bool _flip;
+ bool _clip;
+ std::vector<float> _max_sizes;
+ std::vector<float> _aspect_ratios;
+ Coordinates2D _img_size;
std::array<float, 2> _steps;
};
@@ -1003,15 +1002,15 @@ public:
}
private:
- unsigned int _max_detections;
- unsigned int _max_classes_per_detection;
- float _nms_score_threshold;
- float _iou_threshold;
- unsigned int _num_classes;
+ unsigned int _max_detections;
+ unsigned int _max_classes_per_detection;
+ float _nms_score_threshold;
+ float _iou_threshold;
+ unsigned int _num_classes;
std::array<float, 4> _scales_values;
- bool _use_regular_nms;
- unsigned int _detection_per_class;
- bool _dequantize_scores;
+ bool _use_regular_nms;
+ unsigned int _detection_per_class;
+ bool _dequantize_scores;
};
/** Pooling Layer Information struct*/
@@ -1462,13 +1461,13 @@ public:
}
private:
- float _img_width;
- float _img_height;
- float _scale;
- bool _apply_scale;
- bool _correct_transform_coords;
+ float _img_width;
+ float _img_height;
+ float _scale;
+ bool _apply_scale;
+ bool _correct_transform_coords;
std::array<float, 4> _weights;
- float _bbox_xform_clip;
+ float _bbox_xform_clip;
};
/** Normalization Layer Information class */
@@ -1915,4 +1914,4 @@ struct IOFormatInfo
/** Class for holding information related to cropping */
using CropInfo = Padding2D;
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_CORE_TYPES */
+#endif // ACL_ARM_COMPUTE_CORE_TYPES_H
diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h
deleted file mode 100644
index 567a4023c0..0000000000
--- a/arm_compute/core/experimental/IPostOp.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
-#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
-
-#include <memory>
-#include <numeric>
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-/** Type of Post Op */
-enum class PostOpType
-{
- Activation,
- Eltwise_Add,
- Eltwise_PRelu
-};
-/** An ordered sequence of type of Post Ops */
-using PostOpTypeSequence = std::vector<PostOpType>;
-/** An elementwise n-ary operation that can be appended to and fused with (at kernel-level) other operators
- * It contains:
- * 1. The attributes of the original operator.
- * 2. Any additional tensor argument.
- * 3. The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos )
- *
- * For example, a series of chained ops:
- *
- * div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info)
- *
- * translates to
- *
- * dst = conv(src0, weights, bias, conv_info) // main op
- * dst = relu(dst, act_info) // previous dst is placed in the first (and only) argument
- * dst = div(src1, dst, div_info) // previous dst is placed in the second argument
- *
- * which in turn translates to:
- *
- * main op: conv(src0, weights, bias, conv_info)
- * post op1: relu(act_info, prev_dst_pos = 0)
- * post op2: div(div_info, src1, prev_dst_pos = 1)
- *
- * @note: On Broadcasting
- * For n-ary post ops, the tensor arguments must not "widen" the dst tensor of the main op
- * For example, for a dst of shape [14, 1, 34]:
- * * post_op_arg1 = [1, 1, 34] is allowed: broadcast in dim 0
- * * post_op_arg1 = [14, 1, 34] is allowed: no broadcast
- * * post_op_arg1 = [1, 1, 34] is allowed: broadcast in dims 0 and 1
- * * post_op_arg1 = [14, 15, 34] is NOT allowed: broadcast widens the dst tensor
- *
- * @note: On Data layout
- * All post ops are data layout agnostic. This means post ops do not have an inherent idea of "width", "height" and so on.
- * Should we want to perform a post op with 2 tensors of different data layouts (where data layouts are significant to both),
- * then we need to perform necessary permutation op beforehand to unify their data layout before they can be fused with a post op
- *
- * Note although post ops themselves should be able to support any data layout, the main op they fuse to may impose
- * additional restrictions in the presence of post ops. For example, the implementation of a gemm op may only allow
- * NHWC data layout if post ops are provided. Such restrictions are main op implementation specific.
- *
- * @note: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type
- * @note: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime
- * and the lifetime of its copies. This is almost guaranteed as IPostOp is only meant to be used at configure time
- * after the ITensor or ITensorInfo objects are already constructed
- */
-template <typename TensorRelatedT>
-struct IPostOp
-{
- /** Get the arity of the post op
- * @note: that this is one fewer than the arity of the original op, because we implicitly pass the previous op's dst
- * tensor as one of the arguments
- */
- size_t arity() const
- {
- return arguments().size();
- }
- /** The position of previous op's dst in current op's argument list */
- virtual int prev_dst_pos() const = 0;
- /** The IPostOp type */
- virtual PostOpType type() const = 0;
- /** The argument tensors
- * The order of the argument tensor is strictly preserved
- */
- virtual std::vector<TensorRelatedT *> arguments() = 0;
- virtual std::vector<const TensorRelatedT *> arguments() const = 0;
- /** Clone method used in cases where PostOps are owned by unique_ptr
- * @note: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource
- */
- virtual std::unique_ptr<IPostOp<TensorRelatedT>> clone() const = 0;
- virtual ~IPostOp()
- {
- }
-};
-
-/** A sequence of PostOps that can be appended to the end of other operators */
-template <typename TensorRelatedT>
-class PostOpList
-{
-public:
- /** Constructor */
- PostOpList() = default;
- /** Destructor */
- ~PostOpList() = default;
- PostOpList(const PostOpList &other)
- {
- for(const auto &op : other._post_ops)
- {
- this->_post_ops.push_back(op->clone());
- }
- }
- PostOpList &operator=(const PostOpList &other)
- {
- PostOpList tmp{ other };
- std::swap(tmp, *this);
- return *this;
- }
- PostOpList(PostOpList &&other) = default;
- PostOpList &operator=(PostOpList &&other) = default;
-
- /** Add a new post op at the end of the list */
- template <typename OpT, typename... Args>
- void push_back_op(Args &&... args)
- {
- _post_ops.push_back(std::make_unique<OpT>(std::forward<Args>(args)...));
- }
-
- /** Number of post ops */
- size_t size() const
- {
- return _post_ops.size();
- }
-
- /** Total number of post ops */
- size_t total_num_arguments() const
- {
- return std::accumulate(_post_ops.begin(), _post_ops.end(), 0, [](size_t op1_arity, const auto & op2)
- {
- return op1_arity + op2->arity();
- });
- }
-
- /** Get the underlying post op list */
- std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list()
- {
- return _post_ops;
- }
- const std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list() const
- {
- return _post_ops;
- }
-
-private:
- std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> _post_ops{};
-};
-
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
diff --git a/arm_compute/core/experimental/PostOps.h b/arm_compute/core/experimental/PostOps.h
deleted file mode 100644
index a5585bab5d..0000000000
--- a/arm_compute/core/experimental/PostOps.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2021, 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPS
-#define ARM_COMPUTE_EXPERIMENTAL_POSTOPS
-
-#include "arm_compute/core/experimental/IPostOp.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/function_info/ActivationLayerInfo.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-/** (EXPERIMENTAL_POST_OPS)
- * Implementation of specific IPostOps
-*/
-
-template <typename TensorRelatedT>
-struct PostOpAct : public IPostOp<TensorRelatedT>
-{
-public:
- PostOpAct(const ActivationLayerInfo &act_info)
- : _act_info{ act_info }
- {
- }
- // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy
- ~PostOpAct() override = default;
- PostOpAct(const PostOpAct &) = default;
- PostOpAct &operator=(const PostOpAct &) = default;
- PostOpAct(PostOpAct &&) = default;
- PostOpAct &operator=(PostOpAct &&) = default;
-
- int prev_dst_pos() const override
- {
- return 0;
- }
- PostOpType type() const override
- {
- return PostOpType::Activation;
- }
- std::vector<TensorRelatedT *> arguments() override
- {
- return {};
- }
- std::vector<const TensorRelatedT *> arguments() const override
- {
- return {};
- }
- std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
- {
- return std::make_unique<PostOpAct<TensorRelatedT>>(*this);
- }
- ActivationLayerInfo _act_info;
-};
-
-template <typename TensorRelatedT>
-struct PostOpEltwiseAdd : public IPostOp<TensorRelatedT>
-{
-public:
- PostOpEltwiseAdd(TensorRelatedT addend, int prev_dst_pos, ConvertPolicy policy)
- : _addend{ addend },
- _prev_dst_pos{ prev_dst_pos },
- _policy{ policy }
- {
- }
- // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy
- ~PostOpEltwiseAdd() override = default;
- PostOpEltwiseAdd(const PostOpEltwiseAdd &) = default;
- PostOpEltwiseAdd &operator=(const PostOpEltwiseAdd &) = default;
- PostOpEltwiseAdd(PostOpEltwiseAdd &&) = default;
- PostOpEltwiseAdd &operator=(PostOpEltwiseAdd &&) = default;
- int prev_dst_pos() const override
- {
- return _prev_dst_pos;
- }
- PostOpType type() const override
- {
- return PostOpType::Eltwise_Add;
- }
- std::vector<TensorRelatedT *> arguments() override
- {
- return { &_addend };
- }
- std::vector<const TensorRelatedT *> arguments() const override
- {
- return { &_addend };
- }
- std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
- {
- return std::make_unique<PostOpEltwiseAdd<TensorRelatedT>>(*this);
- }
- TensorRelatedT _addend;
- int _prev_dst_pos;
- ConvertPolicy _policy;
-};
-
-template <typename TensorRelatedT>
-struct PostOpEltwisePRelu : public IPostOp<TensorRelatedT>
-{
-public:
- PostOpEltwisePRelu(TensorRelatedT alpha_param, int prev_dst_pos, ConvertPolicy policy)
- : _alpha_param{ alpha_param },
- _prev_dst_pos{ prev_dst_pos },
- _policy{ policy }
- {
- }
- // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy
- ~PostOpEltwisePRelu() override = default;
- PostOpEltwisePRelu(const PostOpEltwisePRelu &) = default;
- PostOpEltwisePRelu &operator=(const PostOpEltwisePRelu &) = default;
- PostOpEltwisePRelu(PostOpEltwisePRelu &&) = default;
- PostOpEltwisePRelu &operator=(PostOpEltwisePRelu &&) = default;
- int prev_dst_pos() const override
- {
- return _prev_dst_pos;
- }
- PostOpType type() const override
- {
- return PostOpType::Eltwise_PRelu;
- }
- std::vector<TensorRelatedT *> arguments() override
- {
- return { &_alpha_param };
- }
- std::vector<const TensorRelatedT *> arguments() const override
- {
- return { &_alpha_param };
- }
- std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
- {
- return std::make_unique<PostOpEltwisePRelu<TensorRelatedT>>(*this);
- }
- TensorRelatedT _alpha_param;
- int _prev_dst_pos;
- ConvertPolicy _policy;
-};
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPS
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index 1995ab045e..8dd6812b58 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_EXPERIMENTAL_TYPES_H
-#define ARM_COMPUTE_EXPERIMENTAL_TYPES_H
+#ifndef ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorShape.h"
@@ -78,11 +78,6 @@ enum TensorType : int32_t
ACL_VEC_COL_SUM = ACL_SRC_4,
ACL_SHIFTS = ACL_SRC_5,
ACL_MULTIPLIERS = ACL_SRC_6,
-
- // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else
- EXPERIMENTAL_ACL_POST_OP_ARG = 2048,
- EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG,
- EXPERIMENTAL_ACL_POST_OP_ARG_LAST = EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments
};
namespace experimental
@@ -134,4 +129,4 @@ struct MemoryInfo
using MemoryRequirements = std::vector<MemoryInfo>;
} // namespace experimental
} // namespace arm_compute
-#endif /* ARM_COMPUTE_EXPERIMENTAL_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h
index daaf86243a..29a57a00c2 100644
--- a/arm_compute/function_info/GEMMInfo.h
+++ b/arm_compute/function_info/GEMMInfo.h
@@ -21,11 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO
-#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
#include "arm_compute/core/CoreTypes.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include <vector>
@@ -79,7 +78,6 @@ public:
_pretranspose_A(false),
_pretranspose_B(false),
_activation_info(),
- _post_ops(),
_fixed_format(false),
_weight_format(arm_compute::WeightFormat::UNSPECIFIED)
{
@@ -99,14 +97,12 @@ public:
* @param[in] fast_math (Optional) Use a data type of shorter width to improve performance
* @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
* @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
* @param[in] fixed_format (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in amemory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat.
* @param[in] weight_format (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
*/
GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
- const ActivationLayerInfo &activation_info = ActivationLayerInfo(), const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *>(),
- bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
+ const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
: _is_a_reshaped(is_a_reshaped),
_is_b_reshaped(is_b_reshaped),
_reshape_b_only_on_first_run(reshape_b_only_on_first_run),
@@ -120,7 +116,6 @@ public:
_pretranspose_A(false),
_pretranspose_B(false),
_activation_info(activation_info),
- _post_ops(post_ops),
_fixed_format(fixed_format),
_weight_format(weight_format)
{
@@ -271,22 +266,6 @@ public:
{
_activation_info = activation_info;
}
- /** Post operations to apply after the matrix multiplication
- *
- * @return experimental::PostOpList object
- */
- const experimental::PostOpList<ITensorInfo *> &post_ops() const
- {
- return _post_ops;
- }
- /** Set post ops
- *
- * @param[in] post_ops experimental::PostOpList object to set
- */
- void set_post_ops(const experimental::PostOpList<ITensorInfo *> &post_ops)
- {
- _post_ops = post_ops;
- }
/** Flag which specifies if the GEMM operation is running fixed-format kernels.
*
* @return True if the GEMM operation is running fixed-format kernel else false.
@@ -320,22 +299,21 @@ public:
}
private:
- bool _is_a_reshaped;
- bool _is_b_reshaped;
- bool _reshape_b_only_on_first_run;
- int _depth_output_gemm3d;
- bool _reinterpret_input_as_3d;
- bool _retain_internal_weights;
- GEMMLowpOutputStageInfo _gemmlowp_output_stage;
- bool _fast_math;
- bool _fp_mixed_precision;
- bool _broadcast_bias;
- bool _pretranspose_A;
- bool _pretranspose_B;
- ActivationLayerInfo _activation_info;
- experimental::PostOpList<ITensorInfo *> _post_ops;
- bool _fixed_format;
- arm_compute::WeightFormat _weight_format;
+ bool _is_a_reshaped;
+ bool _is_b_reshaped;
+ bool _reshape_b_only_on_first_run;
+ int _depth_output_gemm3d;
+ bool _reinterpret_input_as_3d;
+ bool _retain_internal_weights;
+ GEMMLowpOutputStageInfo _gemmlowp_output_stage;
+ bool _fast_math;
+ bool _fp_mixed_precision;
+ bool _broadcast_bias;
+ bool _pretranspose_A;
+ bool _pretranspose_B;
+ ActivationLayerInfo _activation_info;
+ bool _fixed_format;
+ arm_compute::WeightFormat _weight_format;
};
} //namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO */
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
diff --git a/arm_compute/graph/DataLayerVisitor.h b/arm_compute/graph/DataLayerVisitor.h
index ac7f1c84ee..11d9f1ddc9 100644
--- a/arm_compute/graph/DataLayerVisitor.h
+++ b/arm_compute/graph/DataLayerVisitor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H
-#define ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
+#define ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
#include "arm_compute/graph/IGraphPrinter.h"
#include "arm_compute/graph/INodeVisitor.h"
@@ -48,7 +48,6 @@ public:
void visit(ConvolutionLayerNode &n) override;
void visit(DepthwiseConvolutionLayerNode &n) override;
void visit(FusedConvolutionBatchNormalizationNode &n) override;
- void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
void visit(OutputNode &n) override;
@@ -59,4 +58,4 @@ private:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h
index becd672d90..5646ea81d2 100644
--- a/arm_compute/graph/INode.h
+++ b/arm_compute/graph/INode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019,2021 Arm Limited.
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_INODE_H
-#define ARM_COMPUTE_GRAPH_INODE_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_INODE_H
+#define ACL_ARM_COMPUTE_GRAPH_INODE_H
#include "arm_compute/core/Error.h"
#include "arm_compute/graph/LayerDescriptors.h"
@@ -241,30 +241,19 @@ public:
* @return Assigned target of this node
*/
Target assigned_target() const;
- /** Post operator info list
- *
- * @return Post operator info list
- */
- const std::list<std::unique_ptr<ConvPostOpInfo>> &post_op_info_list() const;
- /** Post operator info list
- *
- * @return Post operator info list
- */
- std::list<std::unique_ptr<ConvPostOpInfo>> &post_op_info_list();
protected:
friend class Graph;
protected:
- Graph *_graph; /**< Backward reference to graph owning the node */
- NodeID _id; /**< Node ID */
- NodeParams _common_params; /**< Node common params */
- std::vector<TensorID> _outputs; /**< Output of the node */
- std::vector<EdgeID> _input_edges; /**< Inputs edge set */
- std::set<EdgeID> _output_edges; /**< Output edge set */
- Target _assigned_target; /**< Assigned target by the Graph executor */
- std::list<std::unique_ptr<ConvPostOpInfo>> _post_op_info_list; /**< Post operator info list */
+ Graph *_graph; /**< Backward reference to graph owning the node */
+ NodeID _id; /**< Node ID */
+ NodeParams _common_params; /**< Node common params */
+ std::vector<TensorID> _outputs; /**< Output of the node */
+ std::vector<EdgeID> _input_edges; /**< Inputs edge set */
+ std::set<EdgeID> _output_edges; /**< Output edge set */
+ Target _assigned_target; /**< Assigned target by the Graph executor */
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_INODE_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_INODE_H
diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h
index 97e95336ef..efe191adfc 100644
--- a/arm_compute/graph/INodeVisitor.h
+++ b/arm_compute/graph/INodeVisitor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_INODEVISITOR_H
-#define ARM_COMPUTE_GRAPH_INODEVISITOR_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
+#define ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
#include "arm_compute/graph/nodes/NodesFwd.h"
@@ -106,16 +106,6 @@ public:
* @param[in] n Node to visit.
*/
virtual void visit(FusedConvolutionBatchNormalizationNode &n) = 0;
- /** Visit FusedConvolutionBatchNormalizationWithPostOpsNode.
- *
- * @param[in] n Node to visit.
- */
- virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) = 0;
- /** Visit FusedConvolutionWithPostOpNode.
- *
- * @param[in] n Node to visit.
- */
- virtual void visit(FusedConvolutionWithPostOpNode &n) = 0;
/** Visit FusedDepthwiseConvolutionBatchNormalizationNode.
*
* @param[in] n Node to visit.
@@ -215,8 +205,6 @@ public:
virtual void visit(FlattenLayerNode &n) override;
virtual void visit(FullyConnectedLayerNode &n) override;
virtual void visit(FusedConvolutionBatchNormalizationNode &n) override;
- virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
- virtual void visit(FusedConvolutionWithPostOpNode &n) override;
virtual void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
virtual void visit(InputNode &n) override;
virtual void visit(NormalizationLayerNode &n) override;
@@ -240,4 +228,4 @@ public:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_INODEVISITOR_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index 8f97bbf845..9df4eba5ec 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_TYPE_PRINTER_H
-#define ARM_COMPUTE_GRAPH_TYPE_PRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
+#define ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
@@ -116,12 +116,6 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::FusedConvolutionBatchNormalizationLayer:
os << "FusedConvolutionBatchNormalizationLayer";
break;
- case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
- os << "FusedConvolutionBatchNormalizationLayerWithPostOpsLayer";
- break;
- case NodeType::FusedConvolutionWithPostOp:
- os << "FusedConvolutionWithPostOp";
- break;
case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
os << "FusedDepthwiseConvolutionBatchNormalizationLayer";
break;
@@ -295,4 +289,4 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolution
}
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_TYPE_PRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index 167f7388d4..8d493403b3 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_TYPES_H
-#define ARM_COMPUTE_GRAPH_TYPES_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_TYPES_H
+#define ACL_ARM_COMPUTE_GRAPH_TYPES_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/PixelValue.h"
@@ -41,32 +41,31 @@ namespace arm_compute
{
namespace graph
{
-using arm_compute::CLTunerMode;
using arm_compute::CLBackendType;
+using arm_compute::CLTunerMode;
using arm_compute::Status;
using arm_compute::Coordinates;
-using arm_compute::DataType;
using arm_compute::DataLayout;
using arm_compute::DataLayoutDimension;
-using arm_compute::TensorShape;
-using arm_compute::Size2D;
+using arm_compute::DataType;
using arm_compute::PermutationVector;
using arm_compute::PixelValue;
+using arm_compute::Size2D;
+using arm_compute::TensorShape;
using arm_compute::ActivationLayerInfo;
using arm_compute::DetectionOutputLayerInfo;
using arm_compute::DetectionPostProcessLayerInfo;
-using arm_compute::NormType;
-using arm_compute::NormalizationLayerInfo;
+using arm_compute::DimensionRoundingType;
using arm_compute::FullyConnectedLayerInfo;
+using arm_compute::InterpolationPolicy;
+using arm_compute::NormalizationLayerInfo;
+using arm_compute::NormType;
using arm_compute::PadStrideInfo;
using arm_compute::PoolingLayerInfo;
using arm_compute::PoolingType;
using arm_compute::PriorBoxLayerInfo;
-using arm_compute::DimensionRoundingType;
-using arm_compute::InterpolationPolicy;
-using arm_compute::experimental::PostOpType;
using GraphID = unsigned int;
using TensorID = unsigned int;
@@ -150,55 +149,6 @@ enum class FastMathHint
Disabled, /**< Fast math disabled for Convolution layer */
};
-/** Convolution post operator info */
-class ConvPostOpInfo
-{
-public:
- /** Returns post op type
- *
- * @return Post op type
- */
- virtual PostOpType type() const = 0;
- virtual ~ConvPostOpInfo()
- {
- }
-};
-
-class ConvPostOpInfoActivation : public ConvPostOpInfo
-{
-public:
- ConvPostOpInfoActivation(const ActivationLayerInfo &act)
- : _act(act)
- {
- }
- ~ConvPostOpInfoActivation() override
- {
- }
- PostOpType type() const override
- {
- return PostOpType::Activation;
- }
- ActivationLayerInfo _act;
-};
-
-class ConvPostOpInfoEltwiseAdd : public ConvPostOpInfo
-{
-public:
- ConvPostOpInfoEltwiseAdd(int arg_pos, const ConvertPolicy &policy)
- : _prev_op_dst_pos(arg_pos), _policy(policy)
- {
- }
- PostOpType type() const override
- {
- return PostOpType::Eltwise_Add;
- }
- ~ConvPostOpInfoEltwiseAdd() override
- {
- }
- int _prev_op_dst_pos;
- ConvertPolicy _policy;
-};
-
/** Supported nodes */
enum class NodeType
{
@@ -219,8 +169,6 @@ enum class NodeType
FlattenLayer,
FullyConnectedLayer,
FusedConvolutionBatchNormalizationLayer,
- FusedConvolutionWithPostOp,
- FusedConvolutionBatchNormalizationLayerWithPostOpsLayer,
FusedDepthwiseConvolutionBatchNormalizationLayer,
GenerateProposalsLayer,
L2NormalizeLayer,
@@ -278,4 +226,4 @@ struct NodeParams
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_TYPES_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_TYPES_H
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 803283e20d..a567427bf1 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
-#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h"
#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/nodes/Nodes.h"
@@ -541,183 +538,6 @@ std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node,
return std::move(func);
}
-/** Create a backend convolution layer function with post operator
- *
- * @tparam ConvolutionLayerFunctions Backend convolution functions
- * @tparam TargetInfo Target-specific information
- *
- * @param[in] node Node to create the backend function for
- * @param[in] ctx Graph context
- *
- * @return Backend convolution layer function
- */
-template <typename ConvolutionLayerFunctions, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &node, GraphContext &ctx)
-{
- validate_node<TargetInfo>(node, 4 /* expected inputs */, 1 /* expected outputs */);
-
- // Extract IO and info
- typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
- typename TargetInfo::TensorType *weights = get_backing_tensor<TargetInfo>(node.input(1));
- typename TargetInfo::TensorType *biases = get_backing_tensor<TargetInfo>(node.input(2));
- typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
-
- const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- if(is_quantized)
- {
- biases->info()->set_data_type(DataType::S32);
- }
-
- const PadStrideInfo conv_info = node.convolution_info();
- const unsigned int num_groups = node.num_groups();
- const ActivationLayerInfo fused_act = node.fused_activation();
-
- experimental::PostOpList<typename TargetInfo::TensorType *> post_ops;
-
- auto &post_op_info_list = node.post_op_info_list();
- for(const auto &post_op_info : post_op_info_list)
- {
- switch(post_op_info->type())
- {
- case PostOpType::Activation:
- {
- const auto act_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoActivation *>(post_op_info.get());
- post_ops.template push_back_op<experimental::PostOpAct<typename TargetInfo::TensorType *>>(act_info->_act);
- break;
- }
- case PostOpType::Eltwise_Add:
- {
- typename TargetInfo::TensorType *add_input = get_backing_tensor<TargetInfo>(node.input(3));
- const auto eltwise_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoEltwiseAdd *>(post_op_info.get());
- post_ops.template push_back_op<experimental::PostOpEltwiseAdd<typename TargetInfo::TensorType *>>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- }
- }
- }
-
- // Create and configure function (we assume that functions have been validated before creation)
- std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
- std::unique_ptr<IFunction> func;
- std::string func_name;
-
- // Fuse convolution with post ops is only supported for conv1x1, which is only implemented as gemmconv2d
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::GEMMConvolutionLayer>(
- std::string("GEMMConvolutionLayer"), mm,
- input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1U, 1U), fused_act, num_groups, post_ops);
-
- // Log info
- std::ostringstream qss;
- if(is_quantized)
- {
- qss << " Input QuantInfo: " << input->info()->quantization_info()
- << " Weights QuantInfo: " << weights->info()->quantization_info()
- << " Output QuantInfo: " << output->info()->quantization_info();
- }
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Groups: " << num_groups
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << qss.str()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << " Post ops" << post_ops
- << std::endl);
- return std::move(func);
-}
-
-/** Create a backend convolution batch normalization layer function with post operator
- *
- * @tparam FusedLayerTypes Backend convolution functions
- * @tparam TargetInfo Target-specific information
- *
- * @param[in] node Node to create the backend function for
- * @param[in] ctx Graph context
- *
- * @return Backend fused convolution with batch normalization layer function
- */
-template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_with_post_op(FusedConvolutionBatchNormalizationWithPostOpsNode &node, GraphContext &ctx)
-{
- validate_node<TargetInfo>(node, 8 /* expected inputs */, 1 /* expected outputs */);
-
- // Extract IO and info
- typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
- typename TargetInfo::TensorType *weights = get_backing_tensor<TargetInfo>(node.input(1));
- typename TargetInfo::TensorType *biases = get_backing_tensor<TargetInfo>(node.input(2));
- typename TargetInfo::TensorType *mean = get_backing_tensor<TargetInfo>(node.input(3));
- typename TargetInfo::TensorType *var = get_backing_tensor<TargetInfo>(node.input(4));
- typename TargetInfo::TensorType *beta = get_backing_tensor<TargetInfo>(node.input(5));
- typename TargetInfo::TensorType *gamma = get_backing_tensor<TargetInfo>(node.input(6));
-
- typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
-
- const PadStrideInfo conv_info = node.convolution_info();
- const unsigned int num_groups = node.num_groups();
- const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled;
- const float epsilon = node.epsilon();
-
- experimental::PostOpList<typename TargetInfo::TensorType *> post_ops;
-
- auto &post_op_info_list = node.post_op_info_list();
- for(const auto &post_op_info : post_op_info_list)
- {
- switch(post_op_info->type())
- {
- case PostOpType::Activation:
- {
- const auto act_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoActivation *>(post_op_info.get());
- post_ops.template push_back_op<experimental::PostOpAct<typename TargetInfo::TensorType *>>(act_info->_act);
- break;
- }
- case PostOpType::Eltwise_Add:
- {
- typename TargetInfo::TensorType *add_input = get_backing_tensor<TargetInfo>(node.input(3));
- const auto eltwise_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoEltwiseAdd *>(post_op_info.get());
- post_ops.template push_back_op<experimental::PostOpEltwiseAdd<typename TargetInfo::TensorType *>>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- }
- }
- }
-
- // Create and configure function (we assume that functions have been validated before creation)
- std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
- std::unique_ptr<IFunction> func;
- std::string func_name;
-
- using FType = FusedConvolutionBatchNormalizationWithPostOpsFunction<TargetInfo, FusedLayerTypes>;
-
- // Create and configure function
- std::tie(func, func_name) = create_named_memory_managed_function<FType>(
- std::string("FusedConvolutionBatchNormalizationLayerWithPostOpsLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, post_ops);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Post Ops:" << post_ops
- << std::endl);
- return std::move(func);
-}
-
/** Create a backend deconvolution layer function
*
* @tparam DeconvolutionLayerFunction Backend deconvolution function
@@ -2025,4 +1845,4 @@ std::unique_ptr<IFunction> create_strided_slice_layer(StridedSliceLayerNode &nod
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h
deleted file mode 100644
index 10f2e5c25e..0000000000
--- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-/** Wrapper function to first apply {NE, CL}BatchNormalizationLayer on the weights and then run {NE, CL}ConvolutionLayer with the modified weights */
-template <typename TargetInfo, typename FusedLayerTypes>
-class FusedConvolutionBatchNormalizationWithPostOpsFunction : public IFunction
-{
-public:
- using TensorType = typename TargetInfo::TensorType;
- using TensorConcreteType = typename TargetInfo::TensorConcreteType;
-
- FusedConvolutionBatchNormalizationWithPostOpsFunction(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
- : _conv_layer(memory_manager), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
- {
- }
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon Small value to avoid division with zero. Default value is 0.001f.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] num_groups Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] fast_math Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
- * @param[in] post_ops A sequence of post operations that are performed after the main operation.
- *
- */
- void configure(TensorType *input,
- TensorType *weights,
- TensorType *bias,
- TensorType *output,
- const TensorType *mean,
- const TensorType *var,
- const TensorType *beta,
- const TensorType *gamma,
- float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math,
- const arm_compute::experimental::PostOpList<TensorType *> &post_ops = experimental::PostOpList<TensorType *> {})
- {
- // We don't run any validate, as we assume that the layers have been already validated
- const bool has_bias = (bias != nullptr);
- const TensorType *bias_to_use;
-
- // We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
- // as batch normalization might end up with a bias != 0
- if(has_bias)
- {
- _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon);
- bias_to_use = bias;
- }
- else
- {
- _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon);
- bias_to_use = &_fused_bias;
- }
-
- ActivationLayerInfo fused_act = ActivationLayerInfo(); // Passing an empty ActivationLayerInfo.
- _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups, post_ops);
-
- if(!has_bias)
- {
- _fused_bias.allocator()->allocate();
- }
- }
-
- // Inherited methods overridden:
- void run()
- {
- prepare();
- _conv_layer.run();
- }
-
- void prepare()
- {
- if(!_is_prepared)
- {
- _fused_batch_norm_layer.run();
- _is_prepared = true;
- }
- }
-
-private:
- typename FusedLayerTypes::ConvolutionLayer _conv_layer;
- typename FusedLayerTypes::FuseBatchNormalization _fused_batch_norm_layer;
- TensorConcreteType _fused_bias;
- bool _is_prepared;
-};
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H */
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index 89dccd88b7..71a6201554 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/Tensor.h"
@@ -183,42 +183,6 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
return status;
}
-/** Validates a Convolution layer node
- *
- * @tparam GEMMConvolutionLayer GEMM Convolution layer function type
- *
- * @param[in] node Node to validate
- *
- * @return Status
- */
-template <typename GEMMConvolutionLayer>
-Status validate_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &node)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating fused ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
- ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 4);
- ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
-
- // Extract IO and info
- arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0));
- arm_compute::ITensorInfo *weights = get_backing_tensor_info(node.input(1));
- arm_compute::ITensorInfo *biases = get_backing_tensor_info(node.input(2));
- arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
-
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- biases->set_data_type(DataType::S32);
- }
-
- const PadStrideInfo conv_info = node.convolution_info();
- //const ConvolutionMethod conv_algorithm = node.convolution_method();
- //const bool fast_math = node.fast_math_hint() == FastMathHint::Enabled;
- const unsigned int num_groups = node.num_groups();
-
- // Validate function
- return GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), num_groups);
-}
-
/** Validates a Depthwise Convolution layer node
*
* @tparam DepthwiseConvolutionLayer Default Depthwise Convolution layer type
@@ -775,4 +739,4 @@ Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h
deleted file mode 100644
index a42e06d889..0000000000
--- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H
-#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H
-
-#include "arm_compute/graph/INode.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-/** Batch Normalization node */
-class FusedConvolutionBatchNormalizationWithPostOpsNode final : public INode
-{
-public:
- /** Constructor
- *
- * @param[in] epsilon Epsilon parameter.
- * @param[in] info Convolution layer attributes.
- * @param[in] num_groups (Optional) Number of groups (Defaults to 1)
- * @param[in] method (Optional) Convolution method to use
- * @param[in] fast_math_hint (Optional) Fast math hint
- */
- FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
- unsigned int num_groups = 1,
- ConvolutionMethod method = ConvolutionMethod::Default,
- FastMathHint fast_math_hint = FastMathHint::Disabled);
-
- /** Epsilon parameter accessor
- *
- * @return Epsilon parameter
- */
- float epsilon() const;
-
- /** Computes convolution output descriptor
- *
- * @param[in] input_descriptor Input descriptor
- * @param[in] weights_descriptor Weights descriptor
- * @param[in] info Convolution operation attributes
- *
- * @return Output descriptor
- */
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info);
-
- /** Sets the convolution layer method to use
- *
- * @param[in] method Method to use for convolution
- */
- void set_convolution_method(ConvolutionMethod method);
-
- /** Number of groups in convolution accessor
- *
- * @return Number of groups in convolution
- */
- unsigned int num_groups() const;
-
- /** Convolution layer method accessor
- *
- * @note This is an indication on which convolution layer implementation to use,
- * if it fails to be created the library's heuristic approach will be used
- *
- * @return Convolution layer method to be used by the node
- */
- ConvolutionMethod convolution_method() const;
-
- /** Sets the fast math hint
- *
- * @param[in] hint Hint to use for convolution
- */
- void set_fast_math_hint(FastMathHint hint);
-
- /** Fast math hint accessor
- *
- * @return Fast math hint to be used by the node
- */
- FastMathHint fast_math_hint() const;
-
- /** Convolution metadata accessor
- *
- * @return Convolution information
- */
- PadStrideInfo convolution_info() const;
-
- // Inherited overridden methods:
- NodeType type() const override;
- bool forward_descriptors() override;
- TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
-
-public:
- static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer;
-
-private:
- float _epsilon;
-
- PadStrideInfo _info;
- unsigned int _num_groups;
- ConvolutionMethod _method;
- FastMathHint _fast_math_hint;
-};
-
-} // namespace graph
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BATCH_NORMALIZATION_LAYER_NODE_H */
diff --git a/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h b/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h
deleted file mode 100644
index 6048994b02..0000000000
--- a/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H
-#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H
-
-#include "arm_compute/graph/INode.h"
-
-#include <list>
-
-namespace arm_compute
-{
-namespace graph
-{
-/** Convolution node */
-class FusedConvolutionWithPostOpNode final : public INode
-{
-public:
- /** Constructor
- *
- * @param[in] info Convolution layer attributes
- * @param[in] num_groups (Optional) Number of groups (Defaults to 1)
- * @param[in] method (Optional) Convolution method to use
- * @param[in] fast_math_hint (Optional) Fast math hint
- * @param[in] out_quant_info (Optional) Output quantization info
- */
- FusedConvolutionWithPostOpNode(PadStrideInfo info,
- unsigned int num_groups = 1,
- ConvolutionMethod method = ConvolutionMethod::Default,
- FastMathHint fast_math_hint = FastMathHint::Disabled,
- QuantizationInfo out_quant_info = QuantizationInfo());
- /** Sets the convolution layer method to use
- *
- * @param[in] method Method to use for convolution
- */
- void set_convolution_method(ConvolutionMethod method);
- /** Convolution layer method accessor
- *
- * @note This is an indication on which convolution layer implementation to use,
- * if it fails to be created the library's heuristic approach will be used
- *
- * @return Convolution layer method to be used by the node
- */
- ConvolutionMethod convolution_method() const;
- /** Sets the fast math fast hint
- *
- * @param[in] hint Hint to use for convolution
- */
- void set_fast_math_hint(FastMathHint hint);
- /** Fast math hint accessor
- *
- * @return Fast math hint to be used by the node
- */
- FastMathHint fast_math_hint() const;
- /** Convolution metadata accessor
- *
- * @return Convolution information
- */
- PadStrideInfo convolution_info() const;
- /** Number of groups in convolution accessor
- *
- * @return Number of groups in convolution
- */
- unsigned int num_groups() const;
- /** Returns fused activation
- *
- * @return Fused activation
- */
- ActivationLayerInfo fused_activation() const;
- /** Sets fused activation
- *
- * @param[in] fused_activation Fused activation to set
- */
- void set_fused_activation(ActivationLayerInfo fused_activation);
- /** Sets convolution info
- *
- * @param[in] info Convolution info to set
- */
- void set_convolution_info(PadStrideInfo info);
- /** Computes convolution output descriptor
- *
- * @param[in] input_descriptor Input descriptor
- * @param[in] weights_descriptor Weights descriptor
- * @param[in] info Convolution operation attributes
- *
- * @return Output descriptor
- */
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info);
-
- // Inherited overridden methods:
- NodeType type() const override;
- bool forward_descriptors() override;
- TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
-
-public:
- static constexpr NodeType node_type = NodeType::FusedConvolutionWithPostOp;
-
-private:
- PadStrideInfo _info;
- unsigned int _num_groups;
- ConvolutionMethod _method;
- FastMathHint _fast_math_hint;
- QuantizationInfo _out_quant_info;
- ActivationLayerInfo _fused_activation;
-};
-} // namespace graph
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H */
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index 3887eaeac6..ae9f177ec4 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_NODES_H
-#define ARM_COMPUTE_GRAPH_NODES_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
+#define ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
#include "arm_compute/graph/nodes/ActivationLayerNode.h"
#include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h"
@@ -43,8 +43,6 @@
#include "arm_compute/graph/nodes/FlattenLayerNode.h"
#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
#include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
#include "arm_compute/graph/nodes/InputNode.h"
@@ -70,4 +68,4 @@
#include "arm_compute/graph/nodes/StackLayerNode.h"
#include "arm_compute/graph/nodes/StridedSliceLayerNode.h"
-#endif /* ARM_COMPUTE_GRAPH_NODES_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h
index f1576d6336..580f339468 100644
--- a/arm_compute/graph/nodes/NodesFwd.h
+++ b/arm_compute/graph/nodes/NodesFwd.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_NODES_FWD_H
-#define ARM_COMPUTE_GRAPH_NODES_FWD_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
+#define ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
namespace arm_compute
{
@@ -49,9 +49,7 @@ class EltwiseLayerNode;
class FlattenLayerNode;
class FullyConnectedLayerNode;
class FusedConvolutionBatchNormalizationNode;
-class FusedConvolutionWithPostOpNode;
class FusedDepthwiseConvolutionBatchNormalizationNode;
-class FusedConvolutionBatchNormalizationWithPostOpsNode;
class GenerateProposalsLayerNode;
class InputNode;
class L2NormalizeLayerNode;
@@ -77,4 +75,4 @@ class StackLayerNode;
class StridedSliceLayerNode;
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_NODES_FWD_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h
index 63b89272f4..564aecfb1e 100644
--- a/arm_compute/graph/printers/DotGraphPrinter.h
+++ b/arm_compute/graph/printers/DotGraphPrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019,2021 Arm Limited.
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H
-#define ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
+#define ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
#include "arm_compute/graph/IGraphPrinter.h"
@@ -57,8 +57,6 @@ public:
void visit(DepthwiseConvolutionLayerNode &n) override;
void visit(EltwiseLayerNode &n) override;
void visit(FusedConvolutionBatchNormalizationNode &n) override;
- void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
- void visit(FusedConvolutionWithPostOpNode &n) override;
void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
void visit(NormalizationLayerNode &n) override;
void visit(PoolingLayerNode &n) override;
@@ -106,4 +104,4 @@ private:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 8c9e45d753..77bf48d613 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -21,12 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
@@ -120,11 +119,9 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
*/
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1,
- const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -144,11 +141,10 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
*/
void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1, const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -167,13 +163,12 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -203,5 +198,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
-#endif /* ARM_COMPUTE_CLCONVOLUTIONLAYER_H */
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 9827340382..4bafef27a9 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -21,10 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTypes.h"
@@ -95,11 +94,9 @@ public:
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
*/
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1,
- const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -118,12 +115,10 @@ public:
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1,
- const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -141,13 +136,11 @@ public:
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- * @param[in] post_ops (Optional) A sequence of post operations that are performed after the main operation.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1,
- const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+ const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
// Inherited methods overridden:
void run() override;
@@ -158,4 +151,4 @@ private:
std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 630f533244..05f172b9f1 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
-#define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
@@ -58,24 +58,22 @@ struct Conv2dInfo
{
Conv2dInfo() = default;
- Conv2dInfo(const PadStrideInfo &conv_info,
- const Size2D &dilation,
- const ActivationLayerInfo &act_info,
- bool enable_fast_math,
- unsigned int num_groups,
- const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {},
- const WeightsInfo &weights_info = WeightsInfo())
- : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups), post_ops(post_ops), weights_info(weights_info)
+ Conv2dInfo(const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups,
+ const WeightsInfo &weights_info = WeightsInfo())
+ : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups), weights_info(weights_info)
{
}
- PadStrideInfo conv_info{};
- Size2D dilation{ 1U, 1U };
- ActivationLayerInfo act_info{};
- bool enable_fast_math{ false };
- unsigned int num_groups{ 1 };
- experimental::PostOpList<ITensorInfo *> post_ops{};
- WeightsInfo weights_info{};
+ PadStrideInfo conv_info{};
+ Size2D dilation{ 1U, 1U };
+ ActivationLayerInfo act_info{};
+ bool enable_fast_math{ false };
+ unsigned int num_groups{ 1 };
+ WeightsInfo weights_info{};
};
/** Descriptor used by the 3d Convolution function */
@@ -102,4 +100,4 @@ struct Conv3dInfo
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 04ee10b9f4..0142497adc 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,6 +48,7 @@ v23.11 Public major release
- Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface:
- @ref experimental::dynamic_fusion::GpuCkwResize
- Update OpenCLâ„¢ API headers to v2023.04.17.
+ - Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and is replaced by the new Dynamic Fusion interface.
- Performance optimizations:
- Optimize @ref cpu::CpuReshape
- Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface with support for FP16/FP32 only:
diff --git a/scripts/format_code.py b/scripts/format_code.py
index fa572cf51c..94c49fdb59 100755
--- a/scripts/format_code.py
+++ b/scripts/format_code.py
@@ -60,7 +60,9 @@ exceptions = [
"/convolution/",
"/arm_gemm/",
"/arm_conv/",
- "compute_kernel_writer/"
+ "compute_kernel_writer/",
+ "SConscript",
+ "SConstruct"
]
def adjust_copyright_year(copyright_years, curr_year):
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f508b7ee2e..a02739f339 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -72,8 +72,6 @@ filegroup(
"graph/nodes/FlattenLayerNode.cpp",
"graph/nodes/FullyConnectedLayer.cpp",
"graph/nodes/FusedConvolutionBatchNormalizationNode.cpp",
- "graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp",
- "graph/nodes/FusedConvolutionWithPostOpNode.cpp",
"graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp",
"graph/nodes/GenerateProposalsLayerNode.cpp",
"graph/nodes/InputNode.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 76409239ea..39fba860fa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -73,8 +73,6 @@ target_sources(
graph/nodes/FlattenLayerNode.cpp
graph/nodes/FullyConnectedLayer.cpp
graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
- graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
- graph/nodes/FusedConvolutionWithPostOpNode.cpp
graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
graph/nodes/GenerateProposalsLayerNode.cpp
graph/nodes/InputNode.cpp
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 03f78697bc..7e56a3ba18 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -23,16 +23,14 @@
*/
#include "src/core/CL/CLUtils.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "support/StringSupport.h"
-#include "src/core/experimental/PostOpUtils.h"
-
namespace arm_compute
{
cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType image_type)
@@ -40,7 +38,7 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
const cl::Context &ctx = CLKernelLibrary::get().context();
- const cl::Buffer &buffer = tensor->cl_buffer();
+ const cl::Buffer &buffer = tensor->cl_buffer();
const ITensorInfo *info = tensor->info();
ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(),
"Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
@@ -113,112 +111,4 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
return cl::Image2D(cl_image);
}
-
-namespace experimental
-{
-PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config)
- : _supported_config(supported_config)
-{
- ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed");
- for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it)
- {
- auto post_op_sequence = it->first;
- auto post_op_slots = std::get<1>(it->second);
- ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots");
- }
-}
-
-bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops)
-{
- for(const auto &op : post_ops.get_list())
- {
- for(const auto &tensor : op->arguments())
- {
- const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape());
- // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting
- if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0))
- {
- return false;
- }
- // NOTE: Kernel limitation: currently only the following broadcasting types are supported:
- // 1. Post op arg is scalar, broadcast in both first and second dims
- // 2. Post op arg is of shape: second dim=1, first dim=N, broadcast only in second dim
- // This means this case: Post op arg is of shape: second dim=M, first dim=1, broadcast only in first dim, is NOT supported
- if(dst->dimension(0) > 1 && dst->dimension(1) > 1 && (*tensor)->dimension(0) == 1 && (*tensor)->dimension(1) > 1)
- {
- return false;
- }
- }
- }
- return true;
-}
-
-bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const
-{
- if(post_ops.size() == 0)
- {
- return true; // Always support cases where no post op is specified
- }
- const auto post_op_sequence = get_post_op_sequence(post_ops);
-
- return _supported_config.find(post_op_sequence) != _supported_config.end();
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList<ITensorInfo *> &post_ops) const
-{
- const auto post_op_sequence = get_post_op_sequence(post_ops);
- const auto slots = std::get<1>(_supported_config.at(post_op_sequence));
- for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id)
- {
- const auto &post_op = post_ops.get_list().at(post_op_id);
- const auto slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]);
- if(post_op->type() == experimental::PostOpType::Activation)
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<ITensorInfo *> *>(post_op.get());
- const auto act_type = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation()));
- const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a());
- const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b());
- build_opts.add_option(act_type);
- build_opts.add_option(act_a_val);
- build_opts.add_option(act_b_val);
- }
- else if(post_op->type() == experimental::PostOpType::Eltwise_Add)
- {
- size_t arg_id = 1;
- const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
- build_opts.add_option(eltwise_op);
- for(const auto &tensor : post_op->arguments())
- {
- const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
- const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
- build_opts.add_option(height);
- build_opts.add_option(width);
- ++arg_id;
- }
- }
- else if(post_op->type() == experimental::PostOpType::Eltwise_PRelu)
- {
- size_t arg_id = 1;
- const auto eltwise_op = slot_prefix + "_ELTWISE_OP=PRELU" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
- build_opts.add_option(eltwise_op);
- for(const auto &tensor : post_op->arguments())
- {
- const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
- const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
- build_opts.add_option(height);
- build_opts.add_option(width);
- ++arg_id;
- }
- }
- }
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const
-{
- const auto post_op_sequence = get_post_op_sequence(post_ops);
- const auto postfix = std::get<0>(_supported_config.at(post_op_sequence));
- kernel_name += postfix;
-}
-} // namespace experimental
-
} // namespace arm_compute
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index e3f12d4b53..f0e79bccfc 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -22,11 +22,10 @@
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_CLUTILS_H
-#define ARM_COMPUTE_CL_CLUTILS_H
+#ifndef ACL_SRC_CORE_CL_CLUTILS_H
+#define ACL_SRC_CORE_CL_CLUTILS_H
#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include <map>
@@ -74,88 +73,6 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
* @return cl::Image2D object
*/
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+} // namespace arm_compute
-namespace experimental
-{
-/** @name (EXPERIMENTAL_POST_OPS)
- * @{
- */
-
-/** Manage validation, building and configurations of PostOp CL kernels */
-class PostOpCLKernelUtils final
-{
-public:
- /** CL kernel name postfix for post ops */
- using NamePostfix = std::string;
- /** CL kernels that supports post ops assign each post op to a 'slot', in accordance with the postfix
- * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots
- * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition
- *
- * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences.
- * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel.
- * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the
- * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }).
- * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence.
- * We can either:
- * 1. assign the elementwise to slot 1 and prelu to slot 2 of kernel 1
- * { { Eltwise_Add, PRelu } -> {"_eltwise_act", {1, 2} } } or
- * 2. assign the elementwise to slot 2 and prelu to slot 4 of kernel 1
- * { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } }
- */
- using Slots = std::vector<unsigned int>;
- using Config = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>;
-
-public:
- explicit PostOpCLKernelUtils(const Config &config);
-
- /** Check if post op argument tensor shapes are compliant
- * All post ops must not alter the shape of the original dst tensor (even after broadcasting)
- *
- * @param[in] dst Dst tensor to apply the post ops to
- * @param[in] post_ops Post ops
- *
- * @return true if shapes are compliant and false otherwise
- */
- static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops);
- /** Check if the post op sequence is supported in the current configuration
- *
- * @param[in] post_ops Post ops
- *
- * @return true if the post op sequence is supported and false otherwise
- */
- bool is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const;
- /** Helper function to set PostOp related build options
- * @note Convention
- * 1. Each post op "slot" is prefixed with "P<slot number>", followed by the usual parameters for that post op.
- * E.g. If the first slot is an activation, we need to pass 3 definitions in this way:
- * -P1_ACTIVATION_TYPE=... -P1_ACTIVATION_A_VAL=... -P1_ACTIVATION_B_VAL=...
- *
- * 2. For multi-ary post ops, to pass the position of the previous op's dest tensor,
- * we append "_X_POS_<pos>" to the post op type.
- * E.g. for a single post op add(dst, x), where dst is the result of the main op.
- * In this case, the position of the previous op's dest is 0, so we pass
- * -P1_ELTWISE_OP=ADD_X_POS_0
- *
- * @param[out] built_opts OpenCL kernel build options
- * @param[in] post_ops Post ops
- *
- */
- void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList<ITensorInfo *> &post_ops) const;
- /** Helper function to set PostOp kernel name
- *
- * @param[out] kernel_name OpenCL kernel name
- * @param[in] post_ops Post ops
- *
- */
- void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const;
-
-private:
- Config _supported_config{};
-};
-/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
-
-} // namespace experimental
-
-} // arm_compute
-
-#endif /* ARM_COMPUTE_CL_CLUTILS_H */
+#endif // ACL_SRC_CORE_CL_CLUTILS_H
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
deleted file mode 100644
index 2c2d60ed13..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence:
- * act (optional): POST_OP1_ACTIVATION_OPTIONAL
- * eltwise_op : POST_OP2_ELTWISE_OP
- * act (optional): POST_OP3_ACTIVATION_OPTIONAL
- */
-
-/** Post Op 1: Activation Block (Optional)
- * @name POST_OP1_ACTIVATION_OPTIONAL
- * Toggled by -DP1_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
- MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-/** @} */ // end of group POST_OP1_ACTIVATION_OPTIONAL
-
-/** Post Op 2: Eltwise Op Block
- * Handles both broadcasting and non-broadcasting cases
- * @name POST_OP2_ELTWISE_OP
- *
- * @param[in] P2_ELTWISE_ARG1_HEIGHT Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] P2_ELTWISE_ARG1_WIDTH Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] OP The elementwise post op
- * @param[in] M0 The number of consecutive rows
- * @param[in] N0 The number of consecutive columns
- * @param[in] BASENAME The basename of the result variables
- * @param[in] ELTWISE_OPERAND_NAME The basename of the other operand variables
- * @param[in] ELTWISE_OPERAND_ROW The starting row of the other operand variables. Required as different boundary handling strategies are used by different kernels
- * E.g. reshaped_only_rhs and native kernels shifts rows (by using COMPUTE_M0_START_ROW) to handle boundary rows,
- * whereas reshaped kernels do not shift rows
- * @param[in] DATA_TYPE Data type of the result variables
- * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] ZERO Zero vector for z offset
- * @param[in] PARTIAL_LOAD_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
- * @param[in] PARTIAL_LOAD_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
- * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0.
- * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0.
- * @{
- */
-#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#if P2_ELTWISE_ARG1_HEIGHT == 1
-#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
- VEC_DATA_TYPE(DATA_TYPE, 1) \
- ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#else // P2_ELTWISE_ARG1_WIDTH == 1; Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_WIDTH == 1
-#else // P2_ELTWISE_ARG1_HEIGHT == 1; Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (ELTWISE_OPERAND_ROW * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_HEIGHT == 1
-#endif // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-/** @} */ // end of group POST_OP2_ELTWISE_OP
-/** Post Op 3: Activation Block (Optional)
- * @name POST_OP3_ACTIVATION_OPTIONAL
- * Toggled by -DP3_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
- MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-/** @} */ // end of group POST_OP3_ACTIVATION_OPTIONAL
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
deleted file mode 100644
index 22ae098772..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h"
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define VFMA(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-
-#if defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_native, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z,
- const int M,
- const int N,
- const int K
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
-#if K0 > 1
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
-
- RHS_VFMA_M0xN0(0, a, b0, c);
- RHS_VFMA_M0xN0(1, a, b1, c);
-#if K0 > 2
- RHS_VFMA_M0xN0(2, a, b2, c);
-#endif // K0 > 2
-#if K0 > 3
- RHS_VFMA_M0xN0(3, a, b3, c);
-#endif // K0 > 3
-#if K0 > 4
- RHS_VFMA_M0xN0(4, a, b4, c);
- RHS_VFMA_M0xN0(5, a, b5, c);
- RHS_VFMA_M0xN0(6, a, b6, c);
- RHS_VFMA_M0xN0(7, a, b7, c);
-#endif // K0 > 4
-#if K0 > 8
- RHS_VFMA_M0xN0(8, a, b8, c);
- RHS_VFMA_M0xN0(9, a, b9, c);
- RHS_VFMA_M0xN0(A, a, bA, c);
- RHS_VFMA_M0xN0(B, a, bB, c);
- RHS_VFMA_M0xN0(C, a, bC, c);
- RHS_VFMA_M0xN0(D, a, bD, c);
- RHS_VFMA_M0xN0(E, a, bE, c);
- RHS_VFMA_M0xN0(F, a, bF, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * rhs_stride_y;
- }
-#endif // K0 > 1
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
- RHS_VFMA_M0xN0(0, a, b, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += rhs_stride_y;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
deleted file mode 100644
index 89577e9ebd..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
+++ /dev/null
@@ -1,1424 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#if defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- c += a.s4 * b.s4; \
- c += a.s5 * b.s5; \
- c += a.s6 * b.s6; \
- c += a.s7 * b.s7; \
- })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c += a.s0 * b.s0; \
- c += a.s1 * b.s1; \
- c += a.s2 * b.s2; \
- c += a.s3 * b.s3; \
- c += a.s4 * b.s4; \
- c += a.s5 * b.s5; \
- c += a.s6 * b.s6; \
- c += a.s7 * b.s7; \
- c += a.s8 * b.s8; \
- c += a.s9 * b.s9; \
- c += a.sA * b.sA; \
- c += a.sB * b.sB; \
- c += a.sC * b.sC; \
- c += a.sD * b.sD; \
- c += a.sE * b.sE; \
- c += a.sF * b.sF; \
- })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#else // defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- c = fma(a.s4, b.s4, c); \
- c = fma(a.s5, b.s5, c); \
- c = fma(a.s6, b.s6, c); \
- c = fma(a.s7, b.s7, c); \
- })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- c = fma(a.s2, b.s2, c); \
- c = fma(a.s3, b.s3, c); \
- c = fma(a.s4, b.s4, c); \
- c = fma(a.s5, b.s5, c); \
- c = fma(a.s6, b.s6, c); \
- c = fma(a.s7, b.s7, c); \
- c = fma(a.s8, b.s8, c); \
- c = fma(a.s9, b.s9, c); \
- c = fma(a.sA, b.sA, c); \
- c = fma(a.sB, b.sB, c); \
- c = fma(a.sC, b.sC, c); \
- c = fma(a.sD, b.sD, c); \
- c = fma(a.sE, b.sE, c); \
- c = fma(a.sF, b.sF, c); \
- })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#endif // defined(MIXED_PRECISION)
-
-#if defined(ARM_DOT_K0XN0)
-#undef ARM_DOT_K0XN0
-#endif // defined(ARM_DOT_K0XN0)
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- ARM_DOT_K0((a), (b##4), (c.s4)); \
- ARM_DOT_K0((a), (b##5), (c.s5)); \
- ARM_DOT_K0((a), (b##6), (c.s6)); \
- ARM_DOT_K0((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(a, b, c) \
- ({ \
- ARM_DOT_K0((a), (b##0), (c.s0)); \
- ARM_DOT_K0((a), (b##1), (c.s1)); \
- ARM_DOT_K0((a), (b##2), (c.s2)); \
- ARM_DOT_K0((a), (b##3), (c.s3)); \
- ARM_DOT_K0((a), (b##4), (c.s4)); \
- ARM_DOT_K0((a), (b##5), (c.s5)); \
- ARM_DOT_K0((a), (b##6), (c.s6)); \
- ARM_DOT_K0((a), (b##7), (c.s7)); \
- ARM_DOT_K0((a), (b##8), (c.s8)); \
- ARM_DOT_K0((a), (b##9), (c.s9)); \
- ARM_DOT_K0((a), (b##A), (c.sA)); \
- ARM_DOT_K0((a), (b##B), (c.sB)); \
- ARM_DOT_K0((a), (b##C), (c.sC)); \
- ARM_DOT_K0((a), (b##D), (c.sD)); \
- ARM_DOT_K0((a), (b##E), (c.sE)); \
- ARM_DOT_K0((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-
-#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
- if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
- (get_global_id(2) * lhs_stride_z);
-
- // Compute RHS matrix address
- __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += get_global_id(2) * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
- for(int i = 0; i < K; i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
- rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
- 2) * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
- if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
- (get_global_id(2) * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
- for(int i = 0; i < K; i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
- // Load values from RHS matrix stored in a cl_image
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Accumulate
- ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
- lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-
- x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
- 2) * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(LHS_TRANSPOSE)
-
-#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
-
-#if defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#else // defined(MIXED_PRECISION
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#endif // defined(MIXED_PRECISION)
-
-#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
- })
-#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
- })
-#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
- })
-#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
- })
-#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
- ({ \
- ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
- ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
- })
-
-// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
-// a is the column-vector (transposed)
-// b is the row-vector (not transposed)
-// C is the output matrix
-// Lower case is a vector (a, b)
-// Upper case is a matrix (C)
-#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
-
-#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
- })
-#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
- })
-#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
- })
-#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
- })
-#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
- ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
- })
-#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
- ({ \
- ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
- ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
- })
-
-// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
-// The dimensions for this matrix multiplications are defined through M0, N0 and K0
-// The dimensions supported are:
-// M0: 1, 2, 3, 4, 8
-// N0: 1, 2, 3, 4, 8, 16
-// K0: 1, 2, 3, 4, 8, 16
-// This macro calls the vector-by-matrix macro K0 times
-// A, B and C are matrices
-#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
- CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
- (M0, N0, TYPE, A, B, C)
-
-#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#endif // defined(RHS_INTERLEAVE)
-
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
- const uint z = get_global_id(2);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
- // Compute RHS matrix address
- __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_addr += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
- __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
- __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
-
- for(int i = 0; i < K; i += K0)
- {
- VEC_DATA_TYPE(DATA_TYPE, M0)
- a0;
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
-#if K0 > 1
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = VLOAD(N0)(0, rhs);
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
- rhs += RHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
- lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
-#ifndef RHS_INTERLEAVE
- rhs += (N0 * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
- 2) * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
- // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(INTERLEAVE)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#endif // defined(RHS_INTERLEAVE)
-
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
- const uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
- __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
-
- for(int i = 0; i < K; i += K0)
- {
- VEC_DATA_TYPE(DATA_TYPE, M0)
- a0;
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
-#if K0 > 1
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-
- a0 = VLOAD(M0)(0, lhs);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
-
- ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
- lhs += LHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
- lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
- x_rhs += K0 * RHS_STEP_X;
-#ifndef RHS_INTERLEAVE
- x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
- // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
- const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
- const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#endif // defined(LHS_TRANSPOSE)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
deleted file mode 100644
index 09ddcde043..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
+++ /dev/null
@@ -1,1399 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define CONCAT(a, b) a##b
-
-#define ARM_DOT1(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- c = fma(a.s0, b.s0, c); \
- c = fma(a.s1, b.s1, c); \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT2(a, b, c); \
- c = fma((a.s2), (b.s2), c); \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT3(a, b, c); \
- c = fma((a.s3), (b.s3), c); \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##8), (c.s8)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##9), (c.s9)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##A), (c.sA)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##B), (c.sB)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##C), (c.sC)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##D), (c.sD)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##E), (c.sE)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-
-#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS reshaped matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS reshaped matrix
- LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
- // Accumulate
- ARM_DOT_K0XN0(1, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(1, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(1, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(1, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(1, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(1, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(1, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(1, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
- const uint LEFTOVER_K = K % K0;
-
- // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
- // Load values from RHS matrix stored in a cl_image
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- if(LEFTOVER_K != 0)
- {
- // Note: We cannot read out-of-bound elements from the RHS matrix because
- // the RHS width is always multiple of K0. This is not be true for the LHS matrix
-
- union UNION_VEC_TYPE
- {
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
-
- union UNION_VEC_TYPE a0 = {.v = 0 };
-#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
-#endif // M0 > 1
-#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
-#endif // M0 > 2
-#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
-#endif // M0 > 3
-#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
-#endif // M0 > 4
-#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
-#endif // M0 > 5
-#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
-#endif // M0 > 6
-#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
-#endif // M0 > 7
-
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
-#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
-#endif // M0 > 1
-#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
-#endif // M0 > 2
-#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
-#endif // M0 > 3
-#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
-#endif // M0 > 4
-#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
-#endif // M0 > 5
-#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
-#endif // M0 > 6
-#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
-#endif // M0 > 7
-
- lhs_offset += sizeof(DATA_TYPE);
- }
-
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
-#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
-#endif // M0 > 1
-#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
-#endif // M0 > 2
-#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
-#endif // M0 > 3
-#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
-#endif // M0 > 4
-#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
-#endif // M0 > 5
-#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
-#endif // M0 > 6
-#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
-#endif // M0 > 7
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#define VFMA(a, b, c) \
- ({ \
- c = fma(a, b, c); \
- })
-
-#if M0 == 1
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-
-#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- IMAGE_DECLARATION(rhs),
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
- // Compute RHS reshaped matrix address
- uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(0, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(4, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(5, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(6, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(8, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(9, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(A, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(B, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(C, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(D, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(E, a, b0, c);
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
- VFMA_M0xN0(0, a, b0, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M Number of rows in LHS matrix not reshaped.
- * @param[in] N Number of columns in RHS matrix not reshaped.
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
- __read_only image2d_t rhs_img,
-#if defined(BETA)
- IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
- IMAGE_DECLARATION(dst),
- // Post Op arguments
- IMAGE_DECLARATION(eltwise_operand),
- uint lhs_stride_z,
- uint rhs_stride_z,
-#if defined(BETA)
- uint bias_stride_z,
-#endif //defined(BETA)
- uint dst_stride_z,
- uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- ,
- const int M,
- const int N,
- const int K)
-{
- // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
- // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
- // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- const bool cond_y = y == 0;
- const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
- if((x * N0 >= N) || (y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else // defined(MATRIX_B_DEPTH)
- const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
- // Compute RHS matrix coordinates
- uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
- const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
- REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
- // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
- int i = 0;
- for(; i <= (K - K0); i += K0)
- {
- // Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
-
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(0, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(4, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(5, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(6, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(8, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(9, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(A, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(B, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(C, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(D, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(E, a, b0, c);
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
- VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
- lhs_offset += K0 * sizeof(DATA_TYPE);
- x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
- }
-
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b0;
- b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
- VFMA_M0xN0(0, a, b0, c);
-
- lhs_offset += sizeof(DATA_TYPE);
- x_rhs += RHS_STEP_X;
- }
-
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
- REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
- // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
- // c = act(c)
- POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
- // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
- POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
- // c = act(c)
- POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
deleted file mode 100644
index b584251c2a..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */
-
-/** List of (binary) elementwise operators, accounting for the argument position of argument X
- * @note X_Pos denotes the position of argument X. e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place
- * @name elementwise_post_ops
- * @{
- */
-#if defined(N0) && !defined(VEC_SIZE)
-#define VEC_SIZE N0
-#endif // defined(N0) && !defined(VEC_SIZE)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
-#define ADD_X_POS_0(x, y) (x) + (y)
-#define SUB_X_POS_0(x, y) (x) - (y)
-#define MAX_X_POS_0(x, y) max(x, y)
-#define MIN_X_POS_0(x, y) min(x, y)
-#define SQUARED_DIFF_X_POS_0(x, y) (x - y) * (x - y)
-#define POWER_X_POS_0(x, y) pow(x, y)
-#if VEC_SIZE == 1
-#define PRELU_X_POS_0(x, y) (x > 0 ? x : x * y)
-#else // VEC_SIZE == 1
-
-#if defined(MIXED_PRECISION)
-#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
-#else // MIXED_PRECISION
-#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
-#endif // MIXED_PRECISION
-
-#endif // VEC_SIZE == 1
-#define DIV_X_POS_0(x, y) (x / y)
-#define AND_X_POS_0(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
-#define OR_X_POS_0(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
-
-#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
-#define SUB_X_POS_1(x, y) (y) - (x)
-#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
-#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
-#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
-#define POWER_X_POS_1(x, y) pow(y, x)
-#if VEC_SIZE == 1
-#define PRELU_X_POS_1(x, y) (y > 0 ? y : y * x)
-#else // VEC_SIZE == 1
-
-#if defined(MIXED_PRECISION)
-#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
-#else // MIXED_PRECISION
-#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
-#endif // MIXED_PRECISION
-
-#endif // VEC_SIZE == 1
-#define DIV_X_POS_1(x, y) (y / x)
-#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
-#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)
-
-// By default use the order of the arguments as they are passed in, ie. _X_POS_0
-#define ADD(x, y) ADD_X_POS_0(x, y)
-#define SUB(x, y) SUB_X_POS_0(x, y)
-#define MAX(x, y) MAX_X_POS_0(x, y)
-#define MIN(x, y) MIN_X_POS_0(x, y)
-#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
-#define POWER(x, y) POWER_X_POS_0(x, y)
-#define PRELU(x, y) PRELU_X_POS_0(x, y)
-#define DIV(x, y) DIV_X_POS_0(x, y)
-#define AND(x, y) AND_X_POS_0(x, y)
-#define OR(x, y) OR_X_POS_0(x, y)
-
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
-/** @} */ // end of group elementwise_post_ops
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name ELTWISE_OP_ROW_n
- *
- * @param[in] OP The elementwise post op
- * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);
-
-#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);
-
-#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
- OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);
-
-#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
- OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);
-
-#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
- OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);
-
-#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
- OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);
-
-#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
- OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);
-
-#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
- OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);
-
-#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
- OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);
-
-#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
- OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);
-
-#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
- OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);
-
-#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
- OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);
-
-#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
- OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);
-
-#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
- OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);
-
-#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
- OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);
-
-#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
- OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);
-
-/** @} */ // end of group ELTWISE_OP_ROW_n
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name ELTWISE_OP_BLOCK
- *
- * Supported cases are N=1,2,3,...,16
- *
- * @param[in] OP The elementwise post op
- * @param[in] N The number of vectors in the block
- * @param[in] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
-#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)
-/** @} */ // end of group ELTWISE_OP_BLOCK
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
- * @name ELTWISE_OP_ROW_BROADCAST_n
- *
- * @param[in] OP The elementwise post op
- * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the broadcast operand 2 variables
- * @{
- */
-#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##0 = OP(OPERAND1##0, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
- OPERAND1##1 = OP(OPERAND1##1, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
- OPERAND1##2 = OP(OPERAND1##2, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
- OPERAND1##3 = OP(OPERAND1##3, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
- OPERAND1##4 = OP(OPERAND1##4, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
- OPERAND1##5 = OP(OPERAND1##5, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
- OPERAND1##6 = OP(OPERAND1##6, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
- OPERAND1##7 = OP(OPERAND1##7, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
- OPERAND1##8 = OP(OPERAND1##8, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
- OPERAND1##9 = OP(OPERAND1##9, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
- OPERAND1##A = OP(OPERAND1##A, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
- OPERAND1##B = OP(OPERAND1##B, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
- OPERAND1##C = OP(OPERAND1##C, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
- OPERAND1##D = OP(OPERAND1##D, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
- OPERAND1##E = OP(OPERAND1##E, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \
- ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
- OPERAND1##F = OP(OPERAND1##F, OPERAND2);
-
-/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
- * @name ELTWISE_OP_BLOCK_BROADCAST
- * @note Only support:
- * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX];
- * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar);
- * Does NOT support broad cast in X dimension: Operand1 [YxX] + Operand2 [Yx1];
- *
- * Supported cases are N=1,2,3,...,16
- *
- * @param[in] OP The elementwise post op
- * @param[in] N The number of vectors in the block
- * @param[in] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2)
-#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2)
-/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
deleted file mode 100644
index e107f4452d..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "gemm_helpers.h"
-#include "load_store_utility.h"
-
-/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations
- * -DMIXED_PRECISION toggles mixed precision mode
- */
-
-/** Mixed-Precision-Aware Activation Block
- * @name MIXED_PRECISION_ACTIVATION_BLOCK
- * params N ... B_VAL: same as those in @ref ACTIVATION_BLOCK
- *
- * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
- ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
- ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK
-
-/** Mixed-Precision-Aware Elementwise Op Block
- * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name MIXED_PRECISION_ELTWISE_OP_BLOCK
- *
- * @param[in] OP The elementwise post op
- * @param[in] M0 The number of consecutive rows
- * @param[in] N0 The number of consecutive columns
- * @param[in] OPERAND1 The basename of the first and result operand variables
- * @param[in] OPERAND2 The basename of the second operand variables
- * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \
- ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK
-
-/** Mixed-Precision-Aware Elementwise Op Broadcast Block
- * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
- * @note Only support:
- * case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1
- * case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1
- * Does NOT support broad cast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1
- *
- * @param[in] OP The elementwise post op
- * @param[in] M0 The number of consecutive rows, > 1
- * @param[in] N0 The number of consecutive columns, >= 1
- * @param[in] OPERAND1 The basename of the first and result operand variables
- * @param[in] OPERAND2 The basename of the second operand variables
- * @param[in] DATA_TYPE_ACCUMULATR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] CONVERTED_OPERAND2 The basename of the second operand variables converted to higher-precision in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2); \
- ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
- ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
-
-/** Mixed-Precision-Aware Boundary-Aware Store Block
- * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
- * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE
- *
- * @param[in] BASENAME_LP The name of the low precision variables, converted from BASENAME, in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
- CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP); \
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
-#endif // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index a32301d8e3..0c30c0e626 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -152,7 +152,6 @@
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
@@ -453,7 +452,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -887,7 +885,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
@@ -1213,7 +1210,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -1713,7 +1709,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
@@ -1993,7 +1988,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -2380,7 +2374,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -2767,7 +2760,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
@@ -3226,7 +3218,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS matrix is NOT reshaped
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
diff --git a/src/core/experimental/PostOpUtils.h b/src/core/experimental/PostOpUtils.h
deleted file mode 100644
index 6217dcc3da..0000000000
--- a/src/core/experimental/PostOpUtils.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2021, 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
-#define ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
-
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
-
-#include "arm_compute/core/experimental/Types.h"
-#include "support/Cast.h"
-
-#include <vector>
-
-/** (EXPERIMENTAL_POST_OPS) */
-namespace arm_compute
-{
-namespace experimental
-{
-/** Transform a PostOpList of type FromTensorT to one of type ToTensorT */
-template <typename FromTensorT, typename ToTensorT>
-PostOpList<ToTensorT> transform_post_op_list_arguments(const PostOpList<FromTensorT> &post_ops, std::function<ToTensorT(FromTensorT)> transform_arg)
-{
- PostOpList<ToTensorT> transformed_post_ops;
- for(const auto &post_op : post_ops.get_list())
- {
- switch(post_op->type())
- {
- case PostOpType::Activation:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const PostOpAct<FromTensorT> *>(post_op.get());
- transformed_post_ops.template push_back_op<PostOpAct<ToTensorT>>(_post_op->_act_info);
- break;
- }
- case PostOpType::Eltwise_Add:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwiseAdd<FromTensorT> *>(post_op.get());
- transformed_post_ops.template push_back_op<PostOpEltwiseAdd<ToTensorT>>(transform_arg(_post_op->_addend), _post_op->_prev_dst_pos, _post_op->_policy);
- break;
- }
- case PostOpType::Eltwise_PRelu:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwisePRelu<FromTensorT> *>(post_op.get());
- transformed_post_ops.template push_back_op<PostOpEltwisePRelu<ToTensorT>>(transform_arg(_post_op->_alpha_param), _post_op->_prev_dst_pos, _post_op->_policy);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- }
- }
- }
- return transformed_post_ops;
-}
-
-/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */
-inline TensorType get_post_op_arg_type(size_t index)
-{
- ARM_COMPUTE_ERROR_ON_MSG(static_cast<int>(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument index is out of range");
- return static_cast<TensorType>(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast<int>(index));
-}
-
-/** Get a sequence of PostOp Types from PostOpList */
-template <typename T>
-PostOpTypeSequence get_post_op_sequence(const PostOpList<T> &post_ops)
-{
- PostOpTypeSequence post_op_sequence;
- for(const auto &op : post_ops.get_list())
- {
- post_op_sequence.push_back(op->type());
- }
- return post_op_sequence;
-}
-
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index d11e4f0b24..39b410d609 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -107,7 +107,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
// Create GEMMInfo structure
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format);
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
// Supported activations in GEMM
const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
@@ -156,8 +156,8 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info,
- experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
+ weight_format));
auto mm_mem_req = _mm_gemmlowp->workspace();
for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -188,7 +188,7 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
// Create GEMMInfo structure
const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format);
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
if(is_quantized)
{
@@ -422,7 +422,7 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weights_info.weight_format());
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index de2e9f9742..e4a3d30b6d 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -275,23 +275,14 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "gemm_mm_native", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" },
{ "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" },
- { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
{ "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
- { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
- { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
{ "gemm_lc_vm_f32", "common/gemm.cl" },
{ "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
{ "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
@@ -623,26 +614,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/common/gemm_utils.clembed"
},
{
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.hembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.hembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed"
- },
- {
- "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed"
- },
- {
"common/gemmlowp.cl",
#include "./cl_kernels/common/gemmlowp.clembed"
},
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index 5fea097ae3..b8997dfc7f 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -31,11 +30,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLUtils.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -52,25 +51,6 @@ namespace
{
using ElementsProcessed = Steps;
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
- // PostOp sequence -> {Kernel Postfix, PostOp Slots}
- { {}, { "", {} } },
- { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
- { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
- { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
- { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
- { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
@@ -90,7 +70,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -133,7 +112,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -240,7 +218,6 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
_reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
- _num_post_op_args = gemm_info.post_ops.total_num_arguments();
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -298,20 +275,11 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- // If post_ops are used, then we disable the use of gemm_info.activation_info
- if(gemm_info.post_ops.size() > 0)
- {
- post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
- }
- else
- {
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- }
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_native");
- post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -396,11 +364,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args);
+ idx0 = 4 * num_arguments_per_2D_tensor() + 7;
}
else
{
- idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args);
+ idx0 = 3 * num_arguments_per_2D_tensor() + 6;
}
const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -412,11 +380,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+ idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
}
else
{
- idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+ idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0);
}
const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -440,12 +408,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
add_2D_tensor_argument(idx, src2, slice);
}
add_2D_tensor_argument(idx, dst, slice);
- // post op argument buffers
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- add_2D_tensor_argument(idx, post_op_arg, slice);
- }
+
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
if(_add_bias)
@@ -453,12 +416,6 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
}
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- // post op argument stride_z
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
- }
// Pass m, n and k at runtime
_kernel.setArg<cl_int>(idx++, _m);
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index e478df727a..80f8355932 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/common/Macros.h"
@@ -76,17 +76,16 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
- unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index f14a6f1900..d72d29ea1e 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -31,11 +30,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -53,25 +52,6 @@ namespace
{
using ElementsProcessed = Steps;
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
- // PostOp sequence -> {Kernel Postfix, PostOp Slots}
- { {}, { "", {} } },
- { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
- { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
- { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
- { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
- { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
@@ -95,7 +75,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -139,7 +118,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -202,7 +180,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
- _num_post_op_args = gemm_info.post_ops.total_num_arguments();
// Check if we need to slide the matrix B
const unsigned int num_dimensions_src0 = src0->num_dimensions();
@@ -260,23 +237,14 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- // If post_ops are used, then we disable the use of gemm_info.activation_info
- if(gemm_info.post_ops.size() > 0)
- {
- post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
- }
- else
- {
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- }
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_reshaped_");
kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
- post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -395,13 +363,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
// dst buffer
add_2D_tensor_argument(idx, dst, slice);
- // post op argument buffers
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- add_2D_tensor_argument(idx, post_op_arg, slice);
- }
-
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
@@ -417,12 +378,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
// dst stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- // post op argument stride_z
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
- }
// Cross-plan padding (if _reinterpret_output_as_3d = true)
if(_reinterpret_output_as_3d)
{
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 2d668b91a3..8d25412a40 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
@@ -100,17 +100,16 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
- unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ bool _export_to_cl_image{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ \ No newline at end of file
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index f780538f53..b34c17cda8 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -23,13 +23,12 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -47,25 +46,6 @@ namespace
{
using ElementsProcessed = Steps;
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
- // PostOp sequence -> {Kernel Postfix, PostOp Slots}
- { {}, { "", {} } },
- { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
- { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
- { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
- { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
- { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
- { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
{
@@ -86,7 +66,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -132,7 +111,6 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -203,7 +181,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
_has_pad_y = gemm_info.has_pad_y;
- _num_post_op_args = gemm_info.post_ops.total_num_arguments();
auto padding_info = get_padding_info({ src0, src1, src2, dst });
@@ -270,22 +247,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
}
- // If post_ops are used, then we disable the use of gemm_info.activation_info
- if(gemm_info.post_ops.size() > 0)
- {
- post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
- }
- else
- {
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
- }
+
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_reshaped_only_rhs_");
kernel_name += rhs_info.transpose ? "t" : "nt";
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
- post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -411,13 +380,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
// dst buffer
add_2D_tensor_argument(idx, dst, slice);
- // post op argument buffers
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- add_2D_tensor_argument(idx, post_op_arg, slice);
- }
-
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
@@ -432,12 +394,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
// dst stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
- // post op argument stride_z
- for(size_t i = 0; i < _num_post_op_args; ++i)
- {
- const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
- }
// Cross-plan padding (if _reinterpret_input_as_3d = true)
if(_reinterpret_input_as_3d && _has_pad_y)
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index 00cdb299ce..471160c94b 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
@@ -90,19 +90,18 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- bool _has_pad_y{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
- unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ bool _export_to_cl_image{ false };
+ bool _has_pad_y{ false };
+ signed int _m{ 1 };
+ signed int _n{ 1 };
+ signed int _k{ 1 };
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index 51248d4a7a..eb9475ccaa 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,7 +90,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
case ConvolutionMethod::WINOGRAD:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
auto f = std::make_unique<ClWinogradConv2d>();
f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
_operator = std::move(f);
@@ -99,7 +98,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
case ConvolutionMethod::DIRECT:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
auto f = std::make_unique<ClDirectConv2d>();
f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
_operator = std::move(f);
@@ -108,7 +106,6 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
case ConvolutionMethod::INDIRECT:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
auto f = std::make_unique<ClIndirectConv2d>();
f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
_operator = std::move(f);
@@ -142,7 +139,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
{
//Validate Winograd
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClWinogradConv2d does not support PostOps");
ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
break;
}
@@ -150,7 +146,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
{
// Validate direct convolution layer
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClDirectConv2d does not support PostOps");
ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
break;
}
@@ -158,7 +153,6 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
{
// Validate indirect convolution layer
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClIndirectConv2d does not support PostOps");
ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
break;
}
@@ -271,17 +265,17 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
if(is_data_type_float(src->data_type()))
{
// Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
- const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
- const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
- const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
- const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
- const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
- const bool is_m_one = output_shape[1] * output_shape[2] == 1;
- const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
- const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
+ const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
+ const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+ const bool is_m_one = output_shape[1] * output_shape[2] == 1;
+ const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+ const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
// Run Winograd if valid and IFM >= 8
if(is_wino_valid && is_ifm_ge_8)
@@ -330,7 +324,7 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
{
const bool is_kernel_sz_odd = kernel_sz % 2;
const bool is_g77 = gpu_target == GPUTarget::G77;
- preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
+ preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
}
// Direct/indirect convolution used for the first layer of the network
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 8db6dabe58..7e331a86f3 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -38,7 +38,6 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -222,7 +221,6 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_native_kernel->set_target(gpu_target);
@@ -254,7 +252,6 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor
kernel_info.reinterpret_input_as_3d = false;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_reshape_lhs_kernel->set_target(gpu_target);
@@ -299,7 +296,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_reshaped_only_rhs_kernel->set_target(gpu_target);
@@ -346,7 +342,6 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
@@ -396,7 +391,6 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
@@ -433,7 +427,6 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con
kernel_info.reinterpret_input_as_3d = false;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -482,7 +475,6 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -531,7 +523,6 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -624,7 +615,12 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
// Select GEMMType
CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
{
- CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
+ CLScheduler::get().target(),
+ a->data_type(),
+ m,
+ n,
+ k,
+ batch_size,
},
gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
index 682477e4ea..5620471ff9 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.cpp
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,14 +54,14 @@ namespace opencl
{
ClGemmConv2d::ClGemmConv2d()
: _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
- _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _use_post_ops(false), _aux_mem(AuxTensorIdx::Count)
+ _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
{
}
ClGemmConv2d::~ClGemmConv2d() = default;
void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ int gemm_3d_depth, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
@@ -76,14 +76,12 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
false, // fast_math
false, // fp_mixed_precision
true, // broadcast_bias
- act_info, // activation_info
- post_ops // post ops
+ act_info // activation_info
);
TensorInfo tmp_src{ *src };
if(_is_quantized)
{
- ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops");
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo input_quantization_info = src->quantization_info();
@@ -118,7 +116,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
}
Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
{
const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
@@ -132,13 +130,11 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig
false, // fast_math
false, // fp_mixed_precision
true, // broadcast_bias
- act_info, // activation_info
- post_ops // post ops
+ act_info // activation_info
);
if(is_quantized)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops");
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo input_quantization_info = src->quantization_info();
@@ -189,19 +185,18 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
// Only for quantize there are few cases where we cannot fuse the activation function in GEMM
_fuse_activation = true;
- _use_post_ops = conv2d_info.post_ops.size() > 0;
const ITensorInfo *gemm_input_to_use = src;
ITensorInfo *gemm_output_to_use = dst;
// Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
// Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
src->dimension(idx_height),
kernel_width,
@@ -318,11 +313,10 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info, conv2d_info.post_ops);
+ configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
if(!_skip_col2im)
{
- ARM_COMPUTE_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClGemmConv2d does not support post ops with col2im operation"); // Post ops must be performed after every other op
// Set the GPU target for col2im
_col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
_col2im_kernel->set_target(CLScheduler::get().target());
@@ -334,8 +328,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
- // Disable running of activation kernel if post ops are used
- if(!_fuse_activation && !_use_post_ops)
+ if(!_fuse_activation)
{
_activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
_activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
@@ -383,15 +376,11 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
&& conv2d_info.conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool fuse_activation = true;
- bool use_post_ops = conv2d_info.post_ops.size() > 0;
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!skip_im2col
- && conv2d_info.post_ops.size() > 0,
- "ClGemmConv2d does not support post ops with col2im or im2col operation"); // Post ops must be performed after every other op
// Validate biases
if(biases != nullptr)
@@ -520,8 +509,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info,
- conv2d_info.post_ops));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
// Validate Col2Im
if(!skip_col2im)
@@ -530,8 +518,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
}
// Validate Activation Layer
- // Disable running (thus validation) of activation kernel if post ops are used
- if(!fuse_activation && !use_post_ops)
+ if(!fuse_activation)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
}
@@ -600,8 +587,7 @@ void ClGemmConv2d::run(ITensorPack &tensors)
}
//Run Activation Layer if we cannot fuse in GEMM
- // Disable running of activation kernel if post ops are used
- if(!_fuse_activation && !_use_post_ops)
+ if(!_fuse_activation)
{
ITensorPack pack =
{
@@ -620,7 +606,7 @@ void ClGemmConv2d::prepare(ITensorPack &tensors)
ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
+ ITensorPack pack =
{
{ TensorType::ACL_SRC, weights },
{ TensorType::ACL_DST, weights_reshaped.get() }
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
index afde7c511d..8a46ee2dc3 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.h
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H
-#define ARM_COMPUTE_CL_GEMM_CONV2D_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -113,8 +112,8 @@ public:
const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -133,7 +132,7 @@ private:
*/
void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+ int gemm_3d_depth, const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -150,7 +149,7 @@ private:
* @return a status
*/
static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+ int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
enum AuxTensorIdx
{
@@ -178,10 +177,9 @@ private:
bool _fuse_activation;
bool _append_bias;
bool _is_prepared;
- bool _use_post_ops;
experimental::MemoryRequirements _aux_mem;
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp
index 85d24b4654..073ffd413d 100644
--- a/src/graph/DataLayerVisitor.cpp
+++ b/src/graph/DataLayerVisitor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -131,14 +131,6 @@ void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
add_convolution_layer_method<FusedConvolutionBatchNormalizationNode>(_layer_data, n);
}
-void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
- _layer_data.clear();
- add_generic_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
- add_convolution_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
- add_convolution_layer_method<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
-}
-
void DataLayerVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
_layer_data.clear();
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index e5b4adda26..70fe44e134 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018,2021 Arm Limited.
+ * Copyright (c) 2018,2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,6 @@ namespace graph
INode::INode()
: _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED}),
_outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED)
- ,_post_op_info_list(std::list<std::unique_ptr<ConvPostOpInfo>> {})
{
}
// clang-format on
@@ -200,15 +199,5 @@ Target INode::assigned_target() const
{
return _assigned_target;
}
-
-const std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list() const
-{
- return _post_op_info_list;
-}
-
-std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list()
-{
- return _post_op_info_list;
-}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp
index f067d618bd..5369f6f539 100644
--- a/src/graph/INodeVisitor.cpp
+++ b/src/graph/INodeVisitor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,14 +85,6 @@ void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
{
default_visit(n);
}
-void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
- default_visit(n);
-}
-void DefaultNodeVisitor::visit(FusedConvolutionWithPostOpNode &n)
-{
- default_visit(n);
-}
void DefaultNodeVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
default_visit(n);
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index c67f6a538b..882810474e 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -274,8 +274,6 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &
return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
case NodeType::FusedConvolutionBatchNormalizationLayer:
return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
- case NodeType::FusedConvolutionWithPostOp:
- return detail::create_fused_convolution_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node), ctx);
case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
case NodeType::GenerateProposalsLayer:
@@ -318,8 +316,6 @@ std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &
return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
case NodeType::StridedSliceLayer:
return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
- case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
- return detail::create_fused_convolution_batch_normalization_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationWithPostOpsNode *>(node), ctx);
default:
return nullptr;
}
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index c50782db48..8fd8c14f63 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,8 +76,6 @@ Status CLNodeValidator::validate(INode *node)
CLDirectConvolutionLayer,
CLGEMMConvolutionLayer,
CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
- case NodeType::FusedConvolutionWithPostOp:
- return detail::validate_fused_convolution_with_post_op<CLGEMMConvolutionLayer>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node));
case NodeType::DepthToSpaceLayer:
return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 8eb3e4cb71..38284b93cf 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -29,8 +29,6 @@
#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
#include "arm_compute/graph/nodes/Nodes.h"
#include "src/graph/mutators/MutatorUtils.h"
@@ -333,441 +331,6 @@ void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse
}
}
-/** Check valid combinations:
- *
- * | Main operator | Post operators |
- * |:--------------|:---------------------------|
- * |conv | add |
- * |conv | act + add |
- * |conv | add + act |
- * |conv | act + add + act |
- *
-*/
-#define MAX_VALIDE_COMBINATION 4
-#define MAX_POST_OP_NUM 3
-NodeType valide_post_op_type[MAX_VALIDE_COMBINATION][MAX_POST_OP_NUM] = { { EltwiseLayerNode::node_type },
- { EltwiseLayerNode::node_type, ActivationLayerNode::node_type },
- { ActivationLayerNode::node_type, EltwiseLayerNode::node_type },
- { ActivationLayerNode::node_type, EltwiseLayerNode::node_type, ActivationLayerNode::node_type }
-};
-
-bool check_post_op_type(NodeType *post_op_type, int len)
-{
- if(len > MAX_POST_OP_NUM || len <= 0)
- {
- return false;
- }
-
- bool found = false;
- for(int i = 0; i < MAX_VALIDE_COMBINATION; ++i)
- {
- for(int j = 0; j < len; ++j)
- {
- if(post_op_type[j] != valide_post_op_type[i][j])
- {
- found = false;
- break;
- }
- found = true;
- }
- if(found)
- break;
- }
-
- return found;
-}
-
-void fuse_convolution_with_post_op(Graph &g, INode *fused_node, std::list<INode *> post_op_node_list, int prev_op_dst_pos)
-{
- unsigned int op_idx = 0;
- // Fuse post operators with conv
- for(const auto &post_op : post_op_node_list)
- {
- switch(post_op->type())
- {
- case EltwiseLayerNode::node_type:
- {
- auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op);
- ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
-
- fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoEltwiseAdd>(prev_op_dst_pos, eltwise_node->convert_policy()));
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Elementwise Layer node with ID : " << post_op->id());
- break;
- }
- case ActivationLayerNode::node_type:
- {
- auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op);
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
-
- fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoActivation>(act_node->activation_info()));
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Activation Layer node with ID : " << post_op->id());
- break;
- }
- default:
- {
- break;
- }
- }
-
- if(op_idx == post_op_node_list.size() - 1) // last fusable node
- {
- transfer_driving_nodes_and_remove_old_node(g, fused_node, post_op, true);
- }
- else
- {
- // Remove node
- g.remove_node(post_op->id());
- }
- op_idx++;
- }
-}
-
-std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
- std::list<INode *> post_op_node_list = {};
- NodeID prev_op_dst_id = conv_node_id;
- NodeType post_op_type_list[3] = { NodeType::Dummy, NodeType::Dummy, NodeType::Dummy };
- int post_op_idx = 0;
-
- // Get list of the connected nodes
- auto current_node = g.node(conv_node_id);
-
- while(post_op_node_list.size() < 3)
- {
- // This convolution node must have only one output edge, otherwise this function would not have been called
-
- auto current_output_edge_id = current_node->output_edges().begin();
- auto current_output_edge = g.edge(*current_output_edge_id);
- auto post_op_node = current_output_edge->consumer();
-
- bool fusable_post_op = false;
- if(post_op_node != nullptr && post_op_node->output_edges().size() > 0)
- {
- switch(post_op_node->type())
- {
- case EltwiseLayerNode::node_type:
- {
- auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op_node);
- ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
- if(eltwise_node->output(0)->accessor() == nullptr)
- {
- post_op_node_list.push_back(post_op_node);
- fusable_post_op = true;
- post_op_type_list[post_op_idx++] = eltwise_node->type();
-
- // Extract elementwise inputs
- const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id();
- const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id();
- if(eltwise_input_id_0 == prev_op_dst_id)
- {
- eltwise_operand_id = eltwise_input_id_1;
- prev_op_dst_pos = 0;
- }
- else if(eltwise_input_id_1 == prev_op_dst_id)
- {
- eltwise_operand_id = eltwise_input_id_0;
- prev_op_dst_pos = 1;
- }
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n");
- }
- break;
- }
- case ActivationLayerNode::node_type:
- {
- auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op_node);
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
- // Check if activation is supported for fusion
- if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
- {
- break;
- }
- if(act_node->output(0)->accessor() == nullptr)
- {
- post_op_node_list.push_back(post_op_node);
- fusable_post_op = true;
- post_op_type_list[post_op_idx++] = act_node->type();
- prev_op_dst_id = act_node->id();
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
- }
- break;
- }
- default:
- {
- break;
- }
- }
-
- // Check if the node is not a branching node and current node is fusable
- if(post_op_node->output_edges().size() == 1 && fusable_post_op == true)
- {
- current_node = post_op_node;
- }
- else
- {
- break;
- }
- }
- }
-
- // Check whether it's valid post op list
- if(post_op_node_list.size() > 0)
- {
- bool fuse_with_post_op = check_post_op_type(post_op_type_list, post_op_node_list.size());
- if(!fuse_with_post_op)
- {
- post_op_node_list.clear();
- }
- }
-
- return post_op_node_list;
-}
-
-/** Fuse below operators:
- *
- * | Main operator | Post operators |
- * |:--------------|:---------------------------|
- * |conv | add |
- * |conv | act + add |
- * |conv | add + act |
- * |conv | act + add + act |
- *
- * Notes: currently, only GEMM supports fusion with post operator
-*/
-void fuse_convolution_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
- ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
-
- auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
- ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
-
- const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
- if(conv_algorithm != ConvolutionMethod::GEMM)
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
-
- // Prevent fusion if fused node has an output accessor
- if(conv_node->output(0)->accessor() == nullptr)
- {
- // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
- const Edge *input_edge = conv_node->input_edge(1);
- if(input_edge != nullptr && input_edge->tensor() != nullptr)
- {
- const DataLayout data_layout = input_edge->tensor()->desc().layout;
- const DataType data_type = input_edge->tensor()->desc().data_type;
- const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
- if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
- }
- else
- {
- return;
- }
-
- // Get post op list
- int eltwise_operand_id = 0;
- int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator
- std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
-
- if(post_op_node_list.size() == 0)
- {
- return;
- }
- else // Do convolution fusion with post op if there're one(elementwise), two or more operators
- {
- const Target assigned_target = conv_node->assigned_target();
-
- // Extract conv inputs
- const auto conv_input_id = conv_node->input_edge(0)->producer_id();
- const auto conv_weights_id = conv_node->input_edge(1)->producer_id();
- const auto conv_info = conv_node->convolution_info();
- const auto conv_method = conv_node->convolution_method();
- const auto num_groups = conv_node->num_groups();
- FastMathHint fast_math_hint = conv_node->fast_math_hint();
-
- // Create the fused node
- const NodeID fused_id = g.add_node<FusedConvolutionWithPostOpNode>(conv_info, num_groups, conv_method, fast_math_hint);
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << conv_node->id());
-
- // Add connections from the conv inputs to the fused node
- g.add_connection(conv_input_id, 0, fused_id, 0);
- g.add_connection(conv_weights_id, 0, fused_id, 1);
- if(conv_node->input_edge(2) != nullptr)
- {
- auto conv_bias_id = conv_node->input_edge(2)->producer_id();
- g.add_connection(conv_bias_id, 0, fused_id, 2);
- }
- // Adding the Element wise operand in case the post op is element wise operation
- auto it = std::find_if(post_op_node_list.begin(),
- post_op_node_list.end(),
- [&](const INode * nd)
- {
- return (nd->type() == graph::NodeType::EltwiseLayer);
- });
-
- if(it != post_op_node_list.end())
- {
- g.add_connection(eltwise_operand_id, 0, fused_id, 3);
- }
- g.remove_node(conv_node->id());
-
- // Update fused node outputs
- auto fused_node = g.node(fused_id);
- fused_node->set_assigned_target(assigned_target);
-
- // Fuse convolution with post op
- fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
- post_op_node_list.clear();
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
- }
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
- }
-}
-
-void fuse_convolution_batch_normalization_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
- ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
-
- auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(output_edge->producer());
- ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
- const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
- if(conv_algorithm != ConvolutionMethod::GEMM)
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
-
- // Prevent fusion if fused node has an output accessor
- if(conv_node->output(0)->accessor() == nullptr)
- {
- // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
- const Edge *input_edge = conv_node->input_edge(1);
- if(input_edge != nullptr && input_edge->tensor() != nullptr)
- {
- const DataLayout data_layout = input_edge->tensor()->desc().layout;
- const DataType data_type = input_edge->tensor()->desc().data_type;
- const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
- if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
- return;
- }
- }
- else
- {
- return;
- }
-
- // Get post op list
- int eltwise_operand_id = 0;
- int prev_op_dst_pos = 0; // Previous operator dst's postion in current operator
- std::list<INode *> post_op_node_list = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
-
- if(post_op_node_list.size() == 0)
- {
- return;
- }
- else // Do convolution fusion with post op if there're one(elementwise), two or more operators
- {
- const Target assigned_target = conv_node->assigned_target();
-
- // Extract conv inputs
- const auto conv_input_id = conv_node->input_edge(0)->producer_id();
- const auto conv_weights_id = conv_node->input_edge(1)->producer_id();
- const auto bn_mean_id = conv_node->input_edge(3)->producer_id();
- const auto bn_var_id = conv_node->input_edge(4)->producer_id();
- const auto conv_info = conv_node->convolution_info();
- const auto conv_method = conv_node->convolution_method();
- const auto num_groups = conv_node->num_groups();
- FastMathHint fast_math_hint = conv_node->fast_math_hint();
-
- // Create the fused node
-
- const float epsilon = conv_node->epsilon();
- const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationWithPostOpsNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint);
-
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing FusedConvolutionBatchNormalization node with ID : " << conv_node->id());
-
- // Add connections from the conv inputs to the fused node
- g.add_connection(conv_input_id, 0, fused_id, 0);
- g.add_connection(conv_weights_id, 0, fused_id, 1);
-
- if(conv_node->input_edge(2) != nullptr)
- {
- auto conv_bias_id = conv_node->input_edge(2)->producer_id();
- g.add_connection(conv_bias_id, 0, fused_id, 2);
- }
- g.add_connection(bn_mean_id, 0, fused_id, 3);
- g.add_connection(bn_var_id, 0, fused_id, 4);
-
- // Move connections of old FusedConvolutionBatchNormalization to the fused node
- if(conv_node->input_edge(5) != nullptr)
- {
- const auto bn_beta_id = conv_node->input_edge(5)->producer_id();
- g.add_connection(bn_beta_id, 0, fused_id, 5);
- }
-
- if(conv_node->input_edge(6) != nullptr)
- {
- const auto bn_gamma_id = conv_node->input_edge(6)->producer_id();
- g.add_connection(bn_gamma_id, 0, fused_id, 6);
- }
-
- // Adding the Element wise operand in case the post op is element wise operation
- auto it = std::find_if(post_op_node_list.begin(),
- post_op_node_list.end(),
- [&](const INode * nd)
- {
- return (nd->type() == graph::NodeType::EltwiseLayer);
- });
-
- if(it != post_op_node_list.end())
- {
- g.add_connection(eltwise_operand_id, 0, fused_id, 7);
- }
-
- // Update fused node outputs
- auto fused_node = g.node(fused_id);
- fused_node->set_assigned_target(assigned_target);
-
- auto conv_node_name = conv_node->name();
-
- // collect the post ops names
- std::string post_ops_name = "";
- for(auto &post_op : post_op_node_list)
- {
- post_ops_name += post_op->name();
- }
- fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + post_ops_name, assigned_target });
-
- // Fuse convolution with post op
- fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
- post_op_node_list.clear();
- g.remove_node(conv_node->id());
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
- }
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
- }
-}
-
template <typename N1, typename F, typename... Args>
void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
{
@@ -839,10 +402,6 @@ void NodeFusionMutator::mutate(Graph &g)
detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
- // The fusion of PostOps to ConvolutionLayer:
- // It must occur after the fusion of PadLayer into ConvolutionLayer
- // It must occur before the fusion of normal ActivationLayer into ConvolutionLayer as it takes precedence
- detail::fuse_layer<ConvolutionLayerNode>(g, cl_target_prec, detail::fuse_convolution_with_post_ops, supported_fused_activations);
detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
@@ -851,7 +410,6 @@ void NodeFusionMutator::mutate(Graph &g)
// The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any
detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
- detail::fuse_layer<FusedConvolutionBatchNormalizationNode>(g, cl_target_prec, detail::fuse_convolution_batch_normalization_with_post_ops, supported_fused_activations);
}
} // namespace graph
} // namespace arm_compute
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
deleted file mode 100644
index af81f0369a..0000000000
--- a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/graph/Utils.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-FusedConvolutionBatchNormalizationWithPostOpsNode::FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
- unsigned int num_groups,
- ConvolutionMethod method,
- FastMathHint fast_math_hint)
- : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint)
-{
- _input_edges.resize(8, EmptyEdgeID);
- _outputs.resize(1, NullTensorID);
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::set_convolution_method(ConvolutionMethod method)
-{
- _method = method;
-}
-
-float FusedConvolutionBatchNormalizationWithPostOpsNode::epsilon() const
-{
- return _epsilon;
-}
-
-ConvolutionMethod FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_method() const
-{
- return _method;
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::set_fast_math_hint(FastMathHint hint)
-{
- _fast_math_hint = hint;
-}
-
-FastMathHint FusedConvolutionBatchNormalizationWithPostOpsNode::fast_math_hint() const
-{
- return _fast_math_hint;
-}
-
-PadStrideInfo FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_info() const
-{
- return _info;
-}
-
-unsigned int FusedConvolutionBatchNormalizationWithPostOpsNode::num_groups() const
-{
- return _num_groups;
-}
-
-TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
- const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
-
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
-
- const DataLayout data_layout = input_descriptor.layout;
- TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
-
- return output_descriptor;
-}
-
-bool FusedConvolutionBatchNormalizationWithPostOpsNode::forward_descriptors()
-{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
- {
- Tensor *dst = output(0);
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
- dst->desc() = configure_output(0);
- return true;
- }
- return false;
-}
-
-TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::configure_output(size_t idx) const
-{
- ARM_COMPUTE_UNUSED(idx);
- const Tensor *src = input(0);
- const Tensor *weights = input(1);
-
- ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
-
- TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
-
- return output_info;
-}
-
-NodeType FusedConvolutionBatchNormalizationWithPostOpsNode::type() const
-{
- return FusedConvolutionBatchNormalizationWithPostOpsNode::node_type;
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::accept(INodeVisitor &v)
-{
- v.visit(*this);
-}
-} // namespace graph
-} // namespace arm_compute
diff --git a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp b/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp
deleted file mode 100644
index 63341e2760..0000000000
--- a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/graph/Utils.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-FusedConvolutionWithPostOpNode::FusedConvolutionWithPostOpNode(PadStrideInfo info,
- unsigned int num_groups,
- ConvolutionMethod method,
- FastMathHint fast_math_hint,
- QuantizationInfo out_quant_info)
- : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation()
-{
- _input_edges.resize(4, EmptyEdgeID);
- _outputs.resize(1, NullTensorID);
-}
-
-void FusedConvolutionWithPostOpNode::set_convolution_method(ConvolutionMethod method)
-{
- _method = method;
-}
-
-ConvolutionMethod FusedConvolutionWithPostOpNode::convolution_method() const
-{
- return _method;
-}
-
-void FusedConvolutionWithPostOpNode::set_fast_math_hint(FastMathHint hint)
-{
- _fast_math_hint = hint;
-}
-
-FastMathHint FusedConvolutionWithPostOpNode::fast_math_hint() const
-{
- return _fast_math_hint;
-}
-
-PadStrideInfo FusedConvolutionWithPostOpNode::convolution_info() const
-{
- return _info;
-}
-
-unsigned int FusedConvolutionWithPostOpNode::num_groups() const
-{
- return _num_groups;
-}
-
-ActivationLayerInfo FusedConvolutionWithPostOpNode::fused_activation() const
-{
- return _fused_activation;
-}
-
-void FusedConvolutionWithPostOpNode::set_fused_activation(ActivationLayerInfo fused_activation)
-{
- _fused_activation = fused_activation;
-}
-
-void FusedConvolutionWithPostOpNode::set_convolution_info(PadStrideInfo info)
-{
- _info = info;
-}
-
-TensorDescriptor FusedConvolutionWithPostOpNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
- const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
-
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
-
- const DataLayout data_layout = input_descriptor.layout;
- TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
-
- return output_descriptor;
-}
-
-bool FusedConvolutionWithPostOpNode::forward_descriptors()
-{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
- {
- Tensor *dst = output(0);
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
- dst->desc() = configure_output(0);
- return true;
- }
- return false;
-}
-
-TensorDescriptor FusedConvolutionWithPostOpNode::configure_output(size_t idx) const
-{
- ARM_COMPUTE_UNUSED(idx);
- const Tensor *src = input(0);
- const Tensor *weights = input(1);
-
- ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
-
- TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
- if(!_out_quant_info.empty())
- {
- output_info.quant_info = _out_quant_info;
- }
-
- return output_info;
-}
-
-NodeType FusedConvolutionWithPostOpNode::type() const
-{
- return FusedConvolutionWithPostOpNode::node_type;
-}
-
-void FusedConvolutionWithPostOpNode::accept(INodeVisitor &v)
-{
- v.visit(*this);
-}
-} // namespace graph
-} // namespace arm_compute
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 1071d50197..9c7c4248bb 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,22 +85,6 @@ void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
_info = ss.str();
}
-void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
- ARM_COMPUTE_UNUSED(n);
- std::stringstream ss;
- ss << "FusedConvolutionBatchNormalizationWithPostOpsNode";
- _info = ss.str();
-}
-
-void DotGraphVisitor::visit(FusedConvolutionWithPostOpNode &n)
-{
- ARM_COMPUTE_UNUSED(n);
- std::stringstream ss;
- ss << "FusedConvolutionWithPostOpNode";
- _info = ss.str();
-}
-
void DotGraphVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
{
ARM_COMPUTE_UNUSED(n);
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 476bf27423..f3c05adb47 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -29,7 +29,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClConv2d.h"
@@ -61,26 +60,21 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_ma
CLConvolutionLayer::~CLConvolutionLayer() = default;
void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
}
void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
- // Convert post op arguments to ITensorInfo
- auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor)
- {
- return tensor->info();
- });
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, transformed_post_ops);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
weights_info, CLScheduler::get().target()))
@@ -97,7 +91,6 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
}
case ConvolutionMethod::FFT:
{
- ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops");
auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager);
f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
_impl->func = std::move(f);
@@ -110,31 +103,23 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
if(_impl->op)
{
- _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
- _impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- size_t post_op_tensor_index = 0;
- for(const auto &op : post_ops.get_list())
- {
- for(auto &tensor : op->arguments())
- {
- _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor);
- }
- }
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
+ _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
+ _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
const GPUTarget gpu_target = CLScheduler::get().target();
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
{
@@ -149,7 +134,6 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
case ConvolutionMethod::FFT:
{
// Validate FFT-based convolution layer
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops");
ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
break;
}
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ad5bfd8dd2..c8c18f35db 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "support/Cast.h"
@@ -69,24 +68,19 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m
CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups, post_ops);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
}
void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- _impl->weights = weights;
- _impl->op = std::make_unique<opencl::ClGemmConv2d>();
- // Convert post op arguments to ITensorInfo
- auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor)
- {
- return tensor->info();
- });
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, transformed_post_ops);
+ _impl->weights = weights;
+ _impl->op = std::make_unique<opencl::ClGemmConv2d>();
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
_impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
_impl->run_pack =
@@ -96,15 +90,6 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
{ TensorType::ACL_SRC_2, biases },
{ TensorType::ACL_DST, output }
};
- // Add post op tensors
- size_t post_op_tensor_index = 0;
- for(const auto &op : post_ops.get_list())
- {
- for(auto &tensor : op->arguments())
- {
- _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor);
- }
- }
_impl->prep_pack =
{
{ TensorType::ACL_SRC_1, weights },
@@ -115,9 +100,9 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
}
Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
- const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, post_ops);
+ const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c4b12e770a..3f2223596f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -67,7 +67,6 @@ target_sources(
validation/reference/L2NormalizeLayer.cpp
validation/reference/ActivationLayer.cpp
validation/reference/SpaceToBatch.cpp
- validation/reference/PostOps.cpp
validation/reference/Im2Col.cpp
validation/reference/DequantizationLayer.cpp
validation/reference/DeconvolutionLayer.cpp
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index bced540d2a..986d76708d 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -69,53 +68,30 @@ constexpr float tolerance_num = 0.07f; /**< T
const auto CNNDataTypes = framework::dataset::make("DataType",
{
DataType::F16,
- DataType::F32,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED,
+ DataType::F32,
+ DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED,
});
/** Grouped CNN data types */
const auto GroupedCNNDataTypes = framework::dataset::make("DataType",
{
DataType::F16,
- DataType::F32
+ DataType::F32
});
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
{
ActivationLayerInfo(),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
});
const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
{
ActivationLayerInfo(),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
});
-
-bool is_post_op_list_valid_in_gemmconv(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &output_shape, DataType data_type, DataLayout data_layout,
- const PadStrideInfo &conv_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
-{
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const auto dilation = Size2D(1U, 1U);
- const unsigned int num_groups = 1U;
-
- TensorInfo input_info(input_shape, 1, data_type, data_layout);
- TensorInfo weights_info(weights_shape, 1, data_type, data_layout);
-
- TensorInfo output_info(output_shape, 1, data_type, data_layout);
-
- WeightsInfo w_info(false, weights_info.dimension(idx_width), weights_info.dimension(idx_height), weights_info.dimension(idx_kernels));
-
- const auto status = CLGEMMConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true),
- &weights_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true),
- conv_info, w_info, dilation, ActivationLayerInfo(), num_groups, post_ops);
- return bool(status);
-}
} // namespace
TEST_SUITE(CL)
@@ -207,72 +183,6 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
enable_fast_math);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
-
-DATA_TEST_CASE(ValidatePostOpSupportInConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 17U, 31U), 1, DataType::F32, DataLayout::NHWC), // Select GEMM
- TensorInfo(TensorShape(17U, 31U, 32U), 1, DataType::F32, DataLayout::NCHW), // Select WINOGRAD
- TensorInfo(TensorShape(27U, 27U, 48U), 1, DataType::F32, DataLayout::NCHW), // Select Direct
- TensorInfo(TensorShape(27U, 27U, 48U), 1, DataType::F32, DataLayout::NCHW), // Select FFT
- }),
- framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(2U, 1U, 1U, 19U), 1, DataType::F32, DataLayout::NHWC),
- TensorInfo(TensorShape(5U, 5U, 32U, 19U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(5U, 5U, 48U, 128U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(11U, 11U, 48U, 24), 1, DataType::F32, DataLayout::NCHW),
- })),
- framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(19U, 17U, 31U), 1, DataType::F32, DataLayout::NHWC),
- TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(27U, 27U, 128U), 1, DataType::F32, DataLayout::NCHW),
- TensorInfo(TensorShape(27U, 27U, 24U), 1, DataType::F32, DataLayout::NCHW),
- })),
- framework::dataset::make("ConvInfo", { PadStrideInfo(1U, 1U, 0U, 0U),
- PadStrideInfo(1U, 1U, 2U, 2U),
- PadStrideInfo(1U, 1U, 2U, 2U),
- PadStrideInfo(1U, 1U, 5U, 5U),
- })),
- framework::dataset::make("EnableFastMath", { false, true, false, false})),
- framework::dataset::make("ExpectedMethod",{ ConvolutionMethod::GEMM,
- ConvolutionMethod::WINOGRAD,
- ConvolutionMethod::DIRECT,
- ConvolutionMethod::FFT,
- })),
- framework::dataset::make("PostOpSupported",{ true, false, false, false
- })),
- input_info, weights_info, output_info, conv_info, enable_fast_math, expected_method, post_op_supported)
-{
- const int idx_width = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::BATCHES);
-
- const auto dilation = Size2D(1U, 1U);
- const unsigned int num_groups = 1U;
-
- WeightsInfo w_info(false, weights_info.dimension(idx_width), weights_info.dimension(idx_height), weights_info.dimension(idx_kernels));
-
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<ITensorInfo*>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-
- ConvolutionMethod actual_method = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
- &weights_info.clone()->set_is_resizable(true),
- &output_info.clone()->set_is_resizable(true), conv_info,
- WeightsInfo(),
- ActivationLayerInfo(),
- GPUTarget::BIFROST,
- dilation,
- enable_fast_math);
- ARM_COMPUTE_EXPECT(actual_method == expected_method, framework::LogLevel::ERRORS);
- const auto is_valid = CLConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true),
- &weights_info.clone()->set_is_resizable(true),
- nullptr,
- &output_info.clone()->set_is_resizable(true),
- conv_info,
- w_info,
- dilation,
- ActivationLayerInfo(),
- enable_fast_math,
- num_groups,
- post_ops);
- ARM_COMPUTE_EXPECT( bool(is_valid) == post_op_supported, framework::LogLevel::ERRORS);
-}
// clang-format on
// *INDENT-ON*
TEST_SUITE_END() // ConvolutionLayer
@@ -285,167 +195,11 @@ using CLGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixtur
template <typename T>
using CLConvolutionValidationWithPaddingFixture = ConvolutionValidationWithPaddingFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto conv_info = PadStrideInfo(1, 1, 0, 0);
- const auto input_shape = TensorShape(16U, 14U, 12U, 2U);
- const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- const TensorShape post_op_arg0_shape(output_shape);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
- auto post_op_arg1_info = post_op_arg_info.clone();
-
- // Unsupported sequence of post ops
- experimental::PostOpList<ITensorInfo *> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- post_op_arg1_info.get(),
- 0,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyNHWCIsSupported, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NCHW;
- const auto conv_info = PadStrideInfo(1, 1, 0, 0);
- const auto input_shape = TensorShape(14U, 12U, 16U, 2U);
- const auto weights_shape = TensorShape(1U, 1U, 16U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- const TensorShape post_op_arg0_shape(output_shape);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
- experimental::PostOpList<ITensorInfo *> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyFloatingTypeIsSupported, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::QASYMM8;
- const auto data_layout = DataLayout::NHWC;
- const auto conv_info = PadStrideInfo(1, 1, 0, 0);
- const auto input_shape = TensorShape(16U, 14U, 12U, 2U);
- const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- const TensorShape post_op_arg0_shape(output_shape);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
- experimental::PostOpList<ITensorInfo *> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyConv1x1Stride1IsSupported_UnsupportedKernelSize, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto conv_info = PadStrideInfo(1, 1, 0, 0);
- const auto input_shape = TensorShape(16U, 14U, 12U, 2U);
- const auto weights_shape = TensorShape(16U, 3U, 3U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- const TensorShape post_op_arg0_shape(output_shape);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
- experimental::PostOpList<ITensorInfo *> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyConv1x1Stride1IsSupported_UnsupportedStride, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto conv_info = PadStrideInfo(3, 3, 0, 0);
- const auto input_shape = TensorShape(16U, 14U, 12U, 2U);
- const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- const TensorShape post_op_arg0_shape(output_shape);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
- experimental::PostOpList<ITensorInfo *> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto conv_info = PadStrideInfo(1, 1, 0, 0);
- const auto input_shape = TensorShape(16U, 14U, 12U, 2U);
- const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- experimental::PostOpList<ITensorInfo *> post_ops{};
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(SupportedPostOps, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto conv_info = PadStrideInfo(1, 1, 0, 0);
- const auto input_shape = TensorShape(16U, 14U, 12U, 2U);
- const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
- const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
- TensorShape post_op_arg0_shape(output_shape);
- post_op_arg0_shape[1] = 1; // Broadcast in "Y" (second) dimension
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
- experimental::PostOpList<ITensorInfo *> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
TEST_SUITE(Float)
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
- framework::dataset::make("ReshapeWeights", { true })),
- framework::dataset::make("DataType",
- DataType::F16)),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
ActivationFunctionsSmallDataset))
{
// Validate output
@@ -456,10 +210,7 @@ TEST_SUITE_END() // FP16
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
- framework::dataset::make("ReshapeWeights", { true })),
- framework::dataset::make("DataType",
- DataType::F32)),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
ActivationFunctionsSmallDataset))
{
// Validate output
@@ -503,16 +254,16 @@ using CLGEMMConvolutionLayerQuantizedMixedDataLayoutFixture = ConvolutionValidat
template <typename T>
using CLGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T, int8_t>;
-const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
{
ActivationLayerInfo(),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
});
const auto QuantizedActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
{
ActivationLayerInfo(),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
});
TEST_SUITE(Quantized)
@@ -520,8 +271,8 @@ TEST_SUITE(Quantized)
const auto QuantizationData = framework::dataset::make("QuantizationInfo",
{
QuantizationInfo(0.5f, 10),
- QuantizationInfo(0.3f, 3),
- QuantizationInfo(1.1f, 10),
+ QuantizationInfo(0.3f, 3),
+ QuantizationInfo(1.1f, 10),
});
TEST_SUITE(QASYMM8)
@@ -637,9 +388,7 @@ TEST_SUITE(Float)
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
- framework::dataset::make("ReshapeWeights", { true })),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+ framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })),
ActivationFunctionsSmallDataset))
{
// Validate output
@@ -661,9 +410,7 @@ TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
- framework::dataset::make("ReshapeWeights", { true })),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+ framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW })),
ActivationFunctionsSmallDataset))
{
// Validate output
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index 7f63a03371..0ddf43766f 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,11 +53,6 @@ using CLGEMMMatrixMultiplyNative = CLSynthetizeOperator<ClGemmMatrixMultiplyNati
template <typename T>
using CLGEMMMatrixMultiplyNativeFixture = GEMMMatrixMultiplyNativeValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
-// Fixture for CLGEMMMatrixMultiplyNative with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyNativeWithPostOpsFixture =
- GEMMMatrixMultiplyNativeWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
-
// Fixture for CLGEMMMatrixMultiplyNative3D
template <typename T>
using CLGEMMMatrixMultiplyNative3DFixture = GEMMMatrixMultiplyNative3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
@@ -146,105 +141,6 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com
broadcast_bias_values),
framework::dataset::make("Activation", ActivationLayerInfo()));
-/** Post Ops */
-using PostOpArgBroadcast = CLGEMMMatrixMultiplyNativeWithPostOpsFixture<float>::PostOpArgBroadcast;
-experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2
- 0,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- // post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(false, false, false), // If broadcast in dims 0, 1 and 2
- 1,
- ConvertPolicy::SATURATE);
- return post_ops;
-}
-// To test that the output of the main op is the first parameter in prelu post op
-experimental::PostOpList<PostOpArgBroadcast> post_ops_4()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
- std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2
- 0,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param
-experimental::PostOpList<PostOpArgBroadcast> post_ops_5()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
- std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-/** Different Post Op Lists */
-const auto post_op_lists = framework::dataset::make("post_op_lists", {
- post_ops_1(),
- post_ops_2(),
- post_ops_3(),
- post_ops_4(),
- post_ops_5()
- } );
-
-bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
-{
- const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
- const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
-
- // Create TensorInfo for post op arguments
- TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
- TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
- TensorInfo input2_info(TensorShape(n), 1, data_type);
- TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
-
- GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
- false /**< reinterpret the input as 3D */,
- true /**< Flag used to broadcast the bias addition */,
- false /**< wider accumm */,
- false /**< has pad y */,
- ActivationLayerInfo::ActivationFunction::IDENTITY,
- 1 /**< Multiplication factor for the width of the 1xW transposed block */,
- 1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
- lhs_info,
- rhs_info,
- 0 /**< Offset to be added to each element of the matrix A */,
- 0 /**< Offset to be added to each element of the matrix B */,
- post_ops);
- return bool(ClGemmMatrixMultiplyNativeKernel::validate(&input0_info.clone()->set_is_resizable(true),
- &input1_info.clone()->set_is_resizable(true),
- &input2_info.clone()->set_is_resizable(true),
- &output_info.clone()->set_is_resizable(true),1.f,1.f,
- lhs_info,
- rhs_info,
- gemm_info));
-}
-
/** Configuration test */
void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
{
@@ -295,119 +191,6 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
TEST_SUITE(CL)
TEST_SUITE(GEMMMatrixMultiplyNative)
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 17;
- const unsigned int n = 1;
- const unsigned int k = 13;
- const unsigned int batch = 2;
- TensorShape post_op_arg0_shape(n, m, batch);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
- auto post_op_arg1_info = post_op_arg_info.clone();
-
- // Unsupported sequence of post ops
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
- post_op_arg1_info.get(),
- 0,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
-{
- // Invalid broadcast: post op tensors "widen" the output tensor
- const auto data_type = DataType::F32;
- const unsigned int m = 1;
- const unsigned int n = 18;
- const unsigned int k = 13;
- const unsigned int batch = 2;
- TensorShape post_op_arg_shape(n, m + 1, batch); // output's Y dimension (m) is "widened", which is not allowed
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
-{
- // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, m, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- experimental::PostOpList<ITensorInfo*> post_ops{};
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(n, 1, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, 1, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, 1, 1);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
TEST_SUITE(Float)
TEST_SUITE(FP32)
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
@@ -541,31 +324,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyNative3DFixture<float>, f
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeWithPostOpsFixture<float>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- framework::dataset::make("M0", { 4 })),
- n0_values_precommit),
- k0_values_precommit),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("alpha", {1.0f} )),
- framework::dataset::make("beta", {1.0f} )),
- framework::dataset::make("broadcast_bias", { false, true } )),
- framework::dataset::make("Activation", { ActivationLayerInfo() })),
- post_op_lists)
- )
-{
- // Validate output
- validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-TEST_SUITE_END() // FusedPostOps
-
TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float
TEST_SUITE_END() // GEMMMatrixMulipltyNative
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index cdd89670fa..b06e4bf213 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -62,21 +61,11 @@ using CLGEMMMatrixMultiplyReshaped = CLSynthetizeOperator<ClGemmMatrixMultiplyRe
template <typename T>
using CLGEMMMatrixMultiplyReshapedFixture = GEMMMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
-// Fixture for CLGEMMMatrixMultiplyReshaped with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedWithPostOpsFixture =
- GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
-
// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision
template <typename T>
using CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture =
GEMMMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped, true>;
-// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture =
- GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped, true>;
-
// Fixture for CLGEMMMatrixMultiplyReshaped3D
template <typename T>
using CLGEMMMatrixMultiplyReshaped3DFixture = GEMMMatrixMultiplyReshaped3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
@@ -184,108 +173,6 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", {
/** LHS transposed values */
const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
-/** Post Ops */
-using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>::PostOpArgBroadcast;
-experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2
- 0,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2
- 1,
- ConvertPolicy::SATURATE);
- return post_ops;
-}
-// To test that the output of the main op is the first parameter in prelu post op
-experimental::PostOpList<PostOpArgBroadcast> post_ops_4()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
- std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2
- 0,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param
-experimental::PostOpList<PostOpArgBroadcast> post_ops_5()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
- std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-/** Different Post Op Lists */
-const auto post_op_lists = framework::dataset::make("post_op_lists", {
- post_ops_1(),
- post_ops_2(),
- post_ops_3(),
- post_ops_4(),
- post_ops_5()
- } );
-
-bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
-{
- const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
- const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
-
- // Create TensorInfo for post op arguments
- TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
- TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
- TensorInfo input2_info(TensorShape(n), 1, data_type);
- TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
-
- const TensorInfo reshaped_input0_info = input0_info.clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(input0_info, lhs_info));
- const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info));
-
- GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
- false /**< reinterpret the input as 3D */,
- true /**< Flag used to broadcast the bias addition */,
- false /**< wider accumm */,
- false /**< has pad y */,
- ActivationLayerInfo::ActivationFunction::IDENTITY,
- 1 /**< Multiplication factor for the width of the 1xW transposed block */,
- 1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
- lhs_info,
- rhs_info,
- 0 /**< Offset to be added to each element of the matrix A */,
- 0 /**< Offset to be added to each element of the matrix B */,
- post_ops);
- return bool(ClGemmMatrixMultiplyReshapedKernel::validate(&reshaped_input0_info.clone()->set_is_resizable(true),
- &reshaped_input1_info.clone()->set_is_resizable(true),
- &input2_info.clone()->set_is_resizable(true),
- &output_info.clone()->set_is_resizable(true),1.f,1.f,
- lhs_info,
- rhs_info,
- gemm_info));
-}
-
} // namespace
TEST_SUITE(CL)
@@ -450,119 +337,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
rhs_info,
gemm_info)) == expected, framework::LogLevel::ERRORS);
}
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 17;
- const unsigned int n = 1;
- const unsigned int k = 13;
- const unsigned int batch = 2;
- TensorShape post_op_arg0_shape(n, m, batch);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
- auto post_op_arg1_info = post_op_arg_info.clone();
-
- // Unsupported sequence of post ops
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
- post_op_arg1_info.get(),
- 0,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
-{
- // Invalid broadcast: post op tensors "widen" the output tensor
- const auto data_type = DataType::F32;
- const unsigned int m = 17;
- const unsigned int n = 1;
- const unsigned int k = 13;
- const unsigned int batch = 2;
- TensorShape post_op_arg_shape(n + 4, m, batch); // output's X dimension (n) is "widened", which is not allowed
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
-{
- // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, m, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- experimental::PostOpList<ITensorInfo*> post_ops{};
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(n, 1, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, 1, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, 1, 1);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
+
TEST_SUITE(Float)
TEST_SUITE(FP32)
@@ -697,44 +472,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
framework::ARM_COMPUTE_PRINT_INFO();
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- v0_values_precommit),
- h0_values_precommit),
- framework::dataset::make("interleave_lhs", { false })),
- framework::dataset::make("interleave_rhs", { false })),
- framework::dataset::make("export_to_cl_image_rhs", false)),
- framework::dataset::make("DataType", DataType::F32)),
- a_values_precommit),
- beta_values_precommit),
- framework::dataset::make("broadcast_bias", { true } )),
- lhs_transpose_values),
- act_values),
- post_op_lists)
- )
-{
- // Validate output
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
TEST_SUITE(ExportToCLImage)
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
@@ -1002,44 +739,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>,
framework::ARM_COMPUTE_PRINT_INFO();
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- v0_values_precommit),
- h0_values_precommit),
- framework::dataset::make("interleave_lhs", { false })),
- framework::dataset::make("interleave_rhs", { false })),
- framework::dataset::make("export_to_cl_image_rhs", true)),
- framework::dataset::make("DataType", DataType::F32)),
- a_values_precommit),
- beta_values_precommit),
- framework::dataset::make("broadcast_bias", { true } )),
- lhs_transpose_values),
- act_values),
- post_op_lists)
- )
-{
- // Validate output only if validate() is successful
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
TEST_SUITE_END() // ExportToCLImage
TEST_SUITE_END() // FP32
@@ -1178,45 +877,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<half>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- v0_values_precommit),
- h0_values_precommit),
- framework::dataset::make("interleave_lhs", { false })),
- framework::dataset::make("interleave_rhs", { false })),
- framework::dataset::make("export_to_cl_image_rhs", false)),
- framework::dataset::make("DataType", DataType::F16)),
- a_values_precommit),
- beta_values_precommit),
- framework::dataset::make("broadcast_bias", { true } )),
- lhs_transpose_values),
- act_values),
- post_op_lists)
- )
-{
- // Validate output
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
-
TEST_SUITE(ExportToCLImage)
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported
@@ -1483,44 +1143,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>,
framework::ARM_COMPUTE_PRINT_INFO();
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<half>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- v0_values_precommit),
- h0_values_precommit),
- framework::dataset::make("interleave_lhs", { false })),
- framework::dataset::make("interleave_rhs", { false })),
- framework::dataset::make("export_to_cl_image_rhs", true)),
- framework::dataset::make("DataType", DataType::F16)),
- a_values_precommit),
- beta_values_precommit),
- framework::dataset::make("broadcast_bias", { true } )),
- lhs_transpose_values),
- act_values),
- post_op_lists)
- )
-{
- // Validate output only if validate() is successful
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
TEST_SUITE_END() // ExportToCLImage
TEST_SUITE_END() // FP16
@@ -1659,45 +1281,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture<half>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- v0_values_precommit),
- h0_values_precommit),
- framework::dataset::make("interleave_lhs", { false })),
- framework::dataset::make("interleave_rhs", { false })),
- framework::dataset::make("export_to_cl_image_rhs", { true, false })),
- framework::dataset::make("DataType", DataType::F16)),
- a_values_precommit),
- beta_values_precommit),
- framework::dataset::make("broadcast_bias", { true } )),
- lhs_transpose_values),
- act_values),
- post_op_lists)
- )
-{
- // Validate output
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
-
TEST_SUITE_END() // MixedPrecision
TEST_SUITE_END() // Float
TEST_SUITE_END() // GEMMMatrixMultiplyReshaped
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index 53038c8177..dafc8dc5ec 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -62,11 +61,6 @@ using CLGEMMMatrixMultiplyReshapedOnlyRHSFixture = GEMMMatrixMultiplyReshapedOnl
template <typename T>
using CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture = GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRHS>;
-// Fixture for CLGEMMMatrixMultiplyReshapedOnlyRHS with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture =
- GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRHS>;
-
namespace
{
// *INDENT-OFF*
@@ -164,106 +158,6 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com
broadcast_bias_values),
framework::dataset::make("Activation", ActivationLayerInfo()));
-/** Post Ops */
-using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<float>::PostOpArgBroadcast;
-experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2
- 0,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
- std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2
- 1,
- ConvertPolicy::SATURATE);
- return post_ops;
-}
-// To test that the output of the main op is the first parameter in prelu post op
-experimental::PostOpList<PostOpArgBroadcast> post_ops_4()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
- std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2
- 0,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param
-experimental::PostOpList<PostOpArgBroadcast> post_ops_5()
-{
- experimental::PostOpList<PostOpArgBroadcast> post_ops{};
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
- post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
- std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
- return post_ops;
-}
-/** Different Post Op Lists */
-const auto post_op_lists = framework::dataset::make("post_op_lists", {
- post_ops_1(),
- post_ops_2(),
- post_ops_3(),
- post_ops_4(),
- post_ops_5()
- } );
-
- bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
-{
- const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
- const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
-
- // Create TensorInfo for post op arguments
- TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
- TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
- TensorInfo input2_info(TensorShape(n), 1, data_type);
- TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
-
- const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info));
-
- GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
- false /**< reinterpret the input as 3D */,
- true /**< Flag used to broadcast the bias addition */,
- false /**< wider accumm */,
- false /**< has pad y */,
- ActivationLayerInfo::ActivationFunction::IDENTITY,
- 1 /**< Multiplication factor for the width of the 1xW transposed block */,
- 1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
- lhs_info,
- rhs_info,
- 0 /**< Offset to be added to each element of the matrix A */,
- 0 /**< Offset to be added to each element of the matrix B */,
- post_ops);
- return bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(&input0_info.clone()->set_is_resizable(true),
- &reshaped_input1_info.clone()->set_is_resizable(true),
- &input2_info.clone()->set_is_resizable(true),
- &output_info.clone()->set_is_resizable(true),1.f,1.f,
- lhs_info,
- rhs_info,
- gemm_info));
-}
/** Configuration test */
bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
@@ -370,119 +264,6 @@ b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output
ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS);
}
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 17;
- const unsigned int n = 1;
- const unsigned int k = 13;
- const unsigned int batch = 2;
- TensorShape post_op_arg0_shape(n, m, batch);
- TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
- auto post_op_arg1_info = post_op_arg_info.clone();
-
- // Unsupported sequence of post ops
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
- &post_op_arg_info,
- 1,
- ConvertPolicy::SATURATE);
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
- post_op_arg1_info.get(),
- 0,
- ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
-{
- // Invalid broadcast: post op tensors "widen" the output tensor
- const auto data_type = DataType::F32;
- const unsigned int m = 17;
- const unsigned int n = 1;
- const unsigned int k = 1;
- const unsigned int batch = 1;
- TensorShape post_op_arg_shape(n, m, batch + 4); // output's batch dimension is "widened", which is not allowed
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
-{
- // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, m, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- experimental::PostOpList<ITensorInfo*> post_ops{};
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(n, 1, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, 1, batch);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const unsigned int m = 22;
- const unsigned int n = 16;
- const unsigned int k = 15;
- const unsigned int batch = 3;
- TensorShape post_op_arg_shape(1, 1, 1);
- TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
- experimental::PostOpList<ITensorInfo*> post_ops{};
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
- ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
TEST_SUITE(Float)
TEST_SUITE(FP32)
@@ -684,43 +465,6 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<float>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- framework::dataset::make("H0", {1})),
- framework::dataset::make("interleave_rhs", { true })),
- t_values_rhs),
- framework::dataset::make("export_to_cl_image_rhs", {false, true})),
- framework::dataset::make("DataType", DataType::F32)),
- a_values),
- beta_values),
- framework::dataset::make("broadcast_bias", { false } )),
- act_values),
- post_op_lists)
- )
-{
- // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
-
TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
@@ -849,42 +593,6 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur
framework::ARM_COMPUTE_PRINT_INFO();
}
}
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<half>, framework::DatasetMode::ALL,
- combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
- m_values,
- n_values),
- k_values),
- b_values),
- m0_values_precommit),
- n0_values_precommit),
- k0_values_precommit),
- framework::dataset::make("H0", {1})),
- framework::dataset::make("interleave_rhs", { true })),
- t_values_rhs),
- framework::dataset::make("export_to_cl_image_rhs", true)),
- framework::dataset::make("DataType", DataType::F16)),
- a_values),
- beta_values),
- framework::dataset::make("broadcast_bias", { false } )),
- act_values),
- post_op_lists)
- )
-{
- // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
- if(validate_result)
- {
- validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
- }
- else
- {
- ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
- framework::ARM_COMPUTE_PRINT_INFO();
- }
-}
-
-TEST_SUITE_END() // FusedPostOps
TEST_SUITE_END() // FP16
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index f27a1796c9..bae8cbf868 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -58,7 +58,7 @@ TEST_SUITE(DYNAMIC_FUSION)
* No quantized tests | Not supported yet
* No grouped CNN tests | Not supported yet
* No mixed layout tests | Not needed; only NHWC is supported
- * No activation/post op tests | Not needed in fusion
+ * No activation | Not needed in fusion
* No ValidateConvolutionMethod | Only a single method (direct conv2d) is supported
* No ReshapeWeights = true tests | Not applicable yet. This parameter only concerns gemm-based conv2d
* No RunSmallWithPadding tests | Padding is removed
@@ -70,9 +70,7 @@ template <typename T>
using DynamicFusionGpuConv2dFixture = DynamicFusionGpuConv2dValidationFixture<CLTensor, CLAccessor, GpuConv2d, T>;
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("DataLayout", { DataLayout::NHWC })),
- framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+ framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", QuantizationInfo())))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -81,9 +79,7 @@ TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("DataLayout", { DataLayout::NHWC })),
- framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+ framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", QuantizationInfo())))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index f1e0ee9150..afde3d8067 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -21,14 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_TEST_GEMM_FIXTURE
-#define ARM_COMPUTE_TEST_GEMM_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "src/core/experimental/PostOpUtils.h"
#include "tests/AssetsLibrary.h"
#include "tests/Globals.h"
#include "tests/IAccessor.h"
@@ -38,7 +36,6 @@
#include "tests/validation/reference/ActivationLayer.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/GEMM.h"
-#include "tests/validation/reference/PostOps.h"
#include <random>
@@ -304,8 +301,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -426,8 +422,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -580,8 +575,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -734,8 +728,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -908,8 +901,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -960,262 +952,6 @@ protected:
SimpleTensor<T> _reference{};
};
-/** (EXPERIMENTAL_POST_OPS)*/
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
-class GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture : public framework::Fixture
-{
-public:
- using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
-public:
- void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs,
- bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info,
- const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
- {
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = m0;
- lhs_info.k0 = k0;
- lhs_info.v0 = v0;
- lhs_info.interleave = interleave_lhs;
- lhs_info.transpose = lhs_transpose;
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = n0;
- rhs_info.k0 = k0;
- rhs_info.h0 = h0;
- rhs_info.interleave = interleave_rhs;
- rhs_info.transpose = !lhs_transpose;
- rhs_info.export_to_cl_image = export_to_cl_image;
-
- // Set the tensor shapes for LHS and RHS matrices
- const TensorShape lhs_shape(k, m, batch_size);
- const TensorShape rhs_shape(n, k, batch_size);
- const TensorShape bias_shape(n,
- broadcast_bias ? 1 : m,
- broadcast_bias ? 1 : batch_size);
- auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
- [ = ](auto broadcast)
- {
- return TensorShape
- {
- std::get<0>(broadcast) ? 1 : n,
- std::get<1>(broadcast) ? 1 : m,
- std::get<2>(broadcast) ? 1 : batch_size,
- };
- });
-
- _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
- if(validate_result)
- {
- _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
- }
- }
-
-protected:
- template <typename U>
- void fill(U &&tensor, int i)
- {
- static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
- using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
- DistributionType distribution{ T(-1.0f), T(1.0f) };
- library->fill(tensor, distribution, i);
-
- // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
- DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
- library->fill_borders_with_garbage(tensor, distribution_inf, i);
- }
-
- TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
- {
- // Create tensors
- TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
- TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
- TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
-
- // Create post op tensors and populate post op with them
- std::vector<TensorType> post_op_tensors_holder{};
- auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
- [&post_op_tensors_holder, &data_type](auto shape)
- {
- auto t = create_tensor<TensorType>(shape, data_type, 1);
- post_op_tensors_holder.push_back(std::move(t));
- return post_op_tensors_holder.back().info();
- });
- TensorType lhs_reshaped;
- TensorType rhs_reshaped;
- TensorType dst;
-
- const unsigned int M = lhs_shape[1];
- const unsigned int N = rhs_shape[0];
- const unsigned int K = lhs_shape[0];
- GEMMKernelInfo kernel_info;
- kernel_info.m = M;
- kernel_info.n = N;
- kernel_info.k = K;
- kernel_info.depth_output_gemm3d = 0;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = act_info;
- kernel_info.fp_mixed_precision = fp_mixed_precision;
- kernel_info.post_ops = populated_post_ops;
-
- // The output tensor will be auto-initialized within the function
-
- // Create and configure function
- ReshapeLHSOperatorType reshape_lhs;
- ReshapeRHSOperatorType reshape_rhs;
- GEMMOperatorType gemm;
-
- validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
- validate_result = validate_result || !rhs_info.export_to_cl_image;
- if(!validate_result)
- {
- return nullptr;
- }
-
- reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
- reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
- gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
-
- ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
- for(const auto &tensor : post_op_tensors_holder)
- {
- ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
- }
-
- // We do not pad when using image as it needs to comply to strict pitch alignment restrictions
- if(!rhs_info.export_to_cl_image)
- {
- add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
- for(auto &tensor : post_op_tensors_holder)
- {
- add_padding_x({ &tensor });
- }
- }
-
- // Allocate tensors
- lhs.allocator()->allocate();
- rhs.allocator()->allocate();
- lhs_reshaped.allocator()->allocate();
- rhs_reshaped.allocator()->allocate();
- bias.allocator()->allocate();
- dst.allocator()->allocate();
- for(auto &tensor : post_op_tensors_holder)
- {
- tensor.allocator()->allocate();
- }
-
- ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
- for(const auto &tensor : post_op_tensors_holder)
- {
- ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
- }
-
- // Fill tensors
- fill(AccessorType(lhs), 0);
- fill(AccessorType(rhs), 1);
- fill(AccessorType(bias), 2);
- for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
- {
- fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
- }
-
- // Compute GEMM
- ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
- reshape_lhs.run(reshape_lhs_pack);
- ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
- reshape_rhs.run(reshape_rhs_pack);
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
- { ACL_SRC_1, &rhs_reshaped },
- { ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
- for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
- {
- gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
- }
- gemm.run(gemm_pack);
-
- return dst;
- }
-
- SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
- const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
- {
- TensorShape dst_shape = lhs_shape;
- dst_shape[0] = rhs_shape[0];
- dst_shape[1] = lhs_shape[1];
-
- // Create reference
- SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
- SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
- SimpleTensor<T> bias{ dst_shape, data_type, 1 };
- // Create post op tensors and populate post op with them
- auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
- {
- return SimpleTensor<T> { shape, data_type, 1 };
- });
-
- const int n = rhs_shape[0];
- const int m = lhs_shape[1];
- const int batch_size = lhs_shape[2];
-
- // Fill reference
- int tensor_idx = 0;
- fill(lhs, tensor_idx++);
- fill(rhs, tensor_idx++);
- fill(bias, tensor_idx++);
- for(auto &op : populated_post_ops.get_list())
- {
- for(auto tensor : op->arguments())
- {
- fill(*tensor, tensor_idx++);
- }
- }
-
- if(broadcast_bias)
- {
- // In case of broadcast, we need to simply copy the first into the following "M" ones
- for(int i = 1; i < m * batch_size; i++)
- {
- memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
- }
- }
-
- SimpleTensor<T> out;
- if(fp_mixed_precision)
- {
- out = reference::gemm_mixed_precision<T>(lhs, rhs, bias, alpha, beta);
- }
- else
- {
- out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
- }
- // Ignore activation info if post ops are used instead
- if(populated_post_ops.size() > 0)
- {
- out = reference::post_ops<T>(out, populated_post_ops);
- }
- else
- {
- out = reference::activation_layer(out, act_info);
- }
- return out;
- }
-
- bool validate_result = true;
- TensorType _target{};
- SimpleTensor<T> _reference{};
-};
-
template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture
{
@@ -1344,8 +1080,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -1515,8 +1250,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -1560,242 +1294,6 @@ protected:
SimpleTensor<T> _reference{};
};
-/** (EXPERIMENTAL_POST_OPS)*/
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
-class GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture : public framework::Fixture
-{
-public:
- using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
- void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
- bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info,
- const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
- {
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = m0;
- lhs_info.k0 = k0;
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = n0;
- rhs_info.k0 = k0;
- rhs_info.h0 = h0;
- rhs_info.interleave = interleave_rhs;
- rhs_info.transpose = transpose_rhs;
- rhs_info.export_to_cl_image = export_to_cl_image;
-
- // Set the tensor shapes for LHS and RHS matrices
- const TensorShape lhs_shape(k, m, batch_size);
- const TensorShape rhs_shape(n, k, batch_size);
- const TensorShape bias_shape(n,
- broadcast_bias ? 1 : m,
- broadcast_bias ? 1 : batch_size);
- auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
- [ = ](auto broadcast)
- {
- return TensorShape
- {
- std::get<0>(broadcast) ? 1 : n,
- std::get<1>(broadcast) ? 1 : m,
- std::get<2>(broadcast) ? 1 : batch_size,
- };
- });
-
- _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
- if(validate_result)
- {
- _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
- }
- }
-
-protected:
- template <typename U>
- void fill(U &&tensor, int i)
- {
- static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
- using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
- DistributionType distribution{ T(-1.0f), T(1.0f) };
- library->fill(tensor, distribution, i);
-
- // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
- DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
- library->fill_borders_with_garbage(tensor, distribution_inf, i);
- }
-
- TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
- {
- // Create tensors
- TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
- TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
- TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
- TensorType rhs_reshaped;
- TensorType dst;
- // Create post op tensors and populate post op with them
- std::vector<TensorType> post_op_tensors_holder{};
- auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
- [&post_op_tensors_holder, &data_type](auto shape)
- {
- auto t = create_tensor<TensorType>(shape, data_type, 1);
- post_op_tensors_holder.push_back(std::move(t));
- return post_op_tensors_holder.back().info();
- });
-
- const unsigned int M = lhs_shape[1];
- const unsigned int N = rhs_shape[0];
- const unsigned int K = lhs_shape[0];
- GEMMKernelInfo kernel_info;
- kernel_info.m = M;
- kernel_info.n = N;
- kernel_info.k = K;
- kernel_info.depth_output_gemm3d = 0;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = act_info;
- kernel_info.post_ops = populated_post_ops;
-
- // The output tensor will be auto-initialized within the function
-
- // Create and configure function
- ReshapeRHSOperatorType reshape_rhs;
- GEMMOperatorType gemm;
-
- validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
- validate_result = validate_result || !rhs_info.export_to_cl_image;
- if(!validate_result)
- {
- return nullptr;
- }
-
- reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
- gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
-
- ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
- for(const auto &tensor : post_op_tensors_holder)
- {
- ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
- }
-
- // We do not pad when using image as it needs to comply to strict pitch alignment restrictions
- if(!rhs_info.export_to_cl_image)
- {
- add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst });
- for(auto &tensor : post_op_tensors_holder)
- {
- add_padding_x({ &tensor });
- }
- }
-
- // Allocate tensors
- lhs.allocator()->allocate();
- rhs.allocator()->allocate();
- rhs_reshaped.allocator()->allocate();
- bias.allocator()->allocate();
- dst.allocator()->allocate();
- for(auto &tensor : post_op_tensors_holder)
- {
- tensor.allocator()->allocate();
- }
-
- ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
- for(const auto &tensor : post_op_tensors_holder)
- {
- ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
- }
-
- // Fill tensors
- fill(AccessorType(lhs), 0);
- fill(AccessorType(rhs), 1);
- fill(AccessorType(bias), 2);
- for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
- {
- fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
- }
-
- // Compute GEMM
- ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
- reshape_rhs.run(reshape_rhs_pack);
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
- { ACL_SRC_1, &rhs_reshaped },
- { ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
- for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
- {
- gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
- }
- gemm.run(gemm_pack);
-
- return dst;
- }
-
- SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
- const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
- {
- TensorShape dst_shape = lhs_shape;
- dst_shape[0] = rhs_shape[0];
- dst_shape[1] = lhs_shape[1];
-
- // Create reference
- SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
- SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
- SimpleTensor<T> bias{ dst_shape, data_type, 1 };
- // Create post op tensors and populate post op with them
- auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
- {
- return SimpleTensor<T> { shape, data_type, 1 };
- });
-
- const int n = rhs_shape[0];
- const int m = lhs_shape[1];
- const int batch_size = lhs_shape[2];
-
- // Fill reference
- int tensor_idx = 0;
- fill(lhs, tensor_idx++);
- fill(rhs, tensor_idx++);
- fill(bias, tensor_idx++);
- for(auto &op : populated_post_ops.get_list())
- {
- for(auto tensor : op->arguments())
- {
- fill(*tensor, tensor_idx++);
- }
- }
-
- if(broadcast_bias)
- {
- // In case of broadcast, we need to simply copy the first into the following "M" ones
- for(int i = 1; i < m * batch_size; i++)
- {
- memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
- }
- }
-
- SimpleTensor<T> out;
- out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
- // Ignore activation info if post ops are used instead
- if(populated_post_ops.size() > 0)
- {
- out = reference::post_ops<T>(out, populated_post_ops);
- }
- else
- {
- out = reference::activation_layer(out, act_info);
- }
- return out;
- }
-
- bool validate_result = true;
- TensorType _target{};
- SimpleTensor<T> _reference{};
-};
-
template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture
{
@@ -1921,8 +1419,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -2057,8 +1554,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -2102,212 +1598,6 @@ protected:
};
template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
-class GEMMMatrixMultiplyNativeWithPostOpsValidationFixture : public framework::Fixture
-{
-public:
- using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
-public:
- void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta, bool broadcast_bias,
- const ActivationLayerInfo &act_info, const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
- {
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = m0;
- lhs_info.k0 = k0;
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = n0;
- rhs_info.k0 = k0;
-
- // Set the tensor shapes for LHS and RHS matrices
- const TensorShape lhs_shape(k, m, batch_size);
- const TensorShape rhs_shape(n, k, batch_size);
- const TensorShape bias_shape(n,
- broadcast_bias ? 1 : m,
- broadcast_bias ? 1 : batch_size);
- const auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
- [ = ](auto broadcast)
- {
- return TensorShape
- {
- std::get<0>(broadcast) ? 1 : n,
- std::get<1>(broadcast) ? 1 : m,
- std::get<2>(broadcast) ? 1 : batch_size,
- };
- });
-
- _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
- _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
- }
-
-protected:
- template <typename U>
- void fill(U &&tensor, int i)
- {
- static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
- using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
- DistributionType distribution{ T(-1.0f), T(1.0f) };
- library->fill(tensor, distribution, i);
-
- // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
- DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
- library->fill_borders_with_garbage(tensor, distribution_inf, i);
- }
-
- TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
- {
- // Create tensors
- TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
- TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
- TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
- TensorType dst;
- // Create post op tensors and populate post op with them
- std::vector<TensorType> post_op_tensors_holder{};
- auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
- [&post_op_tensors_holder, &data_type](auto shape)
- {
- auto t = create_tensor<TensorType>(shape, data_type, 1);
- post_op_tensors_holder.push_back(std::move(t));
- return post_op_tensors_holder.back().info();
- });
-
- const unsigned int M = lhs_shape[1];
- const unsigned int N = rhs_shape[0];
- const unsigned int K = lhs_shape[0];
- GEMMKernelInfo kernel_info;
- kernel_info.m = M;
- kernel_info.n = N;
- kernel_info.k = K;
- kernel_info.depth_output_gemm3d = 0;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = act_info;
- kernel_info.post_ops = populated_post_ops;
-
- // Create and configure function
- GEMMOperatorType gemm;
- gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
-
- ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
- for(const auto &tensor : post_op_tensors_holder)
- {
- ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
- }
-
- add_padding_x({ &lhs, &rhs, &bias, &dst });
- for(auto &tensor : post_op_tensors_holder)
- {
- add_padding_x({ &tensor });
- }
-
- // Allocate tensors
- lhs.allocator()->allocate();
- rhs.allocator()->allocate();
- bias.allocator()->allocate();
- dst.allocator()->allocate();
- for(auto &tensor : post_op_tensors_holder)
- {
- tensor.allocator()->allocate();
- }
-
- ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
- ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
- for(const auto &tensor : post_op_tensors_holder)
- {
- ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
- }
-
- // Fill tensors
- fill(AccessorType(lhs), 0);
- fill(AccessorType(rhs), 1);
- fill(AccessorType(bias), 2);
- for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
- {
- fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
- }
-
- // Compute GEMM
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
- { ACL_SRC_1, &rhs },
- { ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
- for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
- {
- gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
- }
- gemm.run(gemm_pack);
-
- return dst;
- }
-
- SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
- const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
- {
- TensorShape dst_shape = lhs_shape;
- dst_shape[0] = rhs_shape[0];
- dst_shape[1] = lhs_shape[1];
-
- // Create reference
- SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
- SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
- SimpleTensor<T> bias{ dst_shape, data_type, 1 };
- // Create post op tensors and populate post op with them
- auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
- {
- return SimpleTensor<T> { shape, data_type, 1 };
- });
-
- const int n = rhs_shape[0];
- const int m = lhs_shape[1];
- const int batch_size = lhs_shape[2];
-
- // Fill reference
- int tensor_idx = 0;
- fill(lhs, tensor_idx++);
- fill(rhs, tensor_idx++);
- fill(bias, tensor_idx++);
- for(auto &op : populated_post_ops.get_list())
- {
- for(auto tensor : op->arguments())
- {
- fill(*tensor, tensor_idx++);
- }
- }
-
- if(broadcast_bias)
- {
- // In case of broadcast, we need to simply copy the first into the following "M" ones
- for(int i = 1; i < m * batch_size; i++)
- {
- memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
- }
- }
-
- SimpleTensor<T> out;
- out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
- // Ignore activation info if post ops are used instead
- if(populated_post_ops.size() > 0)
- {
- out = reference::post_ops<T>(out, populated_post_ops);
- }
- else
- {
- out = reference::activation_layer(out, act_info);
- }
- return out;
- }
-
- TensorType _target{};
- SimpleTensor<T> _reference{};
-};
-
-template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
class GEMMMatrixMultiplyNative3DValidationFixture : public framework::Fixture
{
public:
@@ -2398,8 +1688,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -2557,8 +1846,7 @@ protected:
ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
{ ACL_SRC_1, &rhs_reshaped },
{ ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ { ACL_DST, &dst } });
gemm.run(gemm_pack);
return dst;
@@ -2608,4 +1896,4 @@ protected:
} // namespace validation
} // namespace test
} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMM_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
diff --git a/tests/validation/reference/PostOps.cpp b/tests/validation/reference/PostOps.cpp
deleted file mode 100644
index ecfed4c48f..0000000000
--- a/tests/validation/reference/PostOps.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "PostOps.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
-#include "support/Cast.h"
-#include "tests/validation/reference/ActivationLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops)
-{
- // Create reference
- SimpleTensor<T> dst{ a };
-
- for(auto &post_op : post_ops.get_list())
- {
- switch(post_op->type())
- {
- case experimental::PostOpType::Activation:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<SimpleTensor<T>> *>(post_op.get());
- dst = reference::activation_layer(dst, _post_op->_act_info);
- break;
- }
- case experimental::PostOpType::Eltwise_Add:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<SimpleTensor<T>> *>(post_op.get());
- dst = reference::arithmetic_operation(ArithmeticOperation::ADD, dst, _post_op->_addend, dst, _post_op->_policy);
- break;
- }
- case experimental::PostOpType::Eltwise_PRelu:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwisePRelu<SimpleTensor<T>> *>(post_op.get());
-
- // If previous main operation output is the the first pRelu argument, then pass it as src1 parameter of the arithmetic operation
- if(_post_op->_prev_dst_pos == 0)
- {
- dst = reference::arithmetic_operation(ArithmeticOperation::PRELU, dst, _post_op->_alpha_param, dst, _post_op->_policy);
- }
- // If previous main operation output is the the second pRelu argument, then pass it as src2 parameter of the arithmetic operation
- else if(_post_op->_prev_dst_pos == 1)
- {
- dst = reference::arithmetic_operation(ArithmeticOperation::PRELU, _post_op->_alpha_param, dst, dst, _post_op->_policy);
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- }
- }
- }
- return dst;
-}
-
-template SimpleTensor<float> post_ops(const SimpleTensor<float> &a, experimental::PostOpList<SimpleTensor<float>> post_ops);
-template SimpleTensor<half> post_ops(const SimpleTensor<half> &a, experimental::PostOpList<SimpleTensor<half>> post_ops);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute \ No newline at end of file
diff --git a/tests/validation/reference/PostOps.h b/tests/validation/reference/PostOps.h
deleted file mode 100644
index 5fe0fe71f5..0000000000
--- a/tests/validation/reference/PostOps.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_POSTOPS_H
-#define ARM_COMPUTE_TEST_POSTOPS_H
-
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "tests/SimpleTensor.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-/** (EXPERIMENTAL_POST_OPS) */
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops);
-
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_POSTOPS_H */
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 4bc326b574..69cc3d4fc0 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -21,8 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_TYPE_PRINTER_H__
-#define __ARM_COMPUTE_TYPE_PRINTER_H__
+
+#ifndef ACL_UTILS_TYPEPRINTER_H
+#define ACL_UTILS_TYPEPRINTER_H
#ifdef ARM_COMPUTE_OPENCL_ENABLED
#include "arm_compute/core/CL/ICLTensor.h"
@@ -36,8 +37,6 @@
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
@@ -150,144 +149,6 @@ std::string to_string(const std::vector<T> &args)
return str.str();
}
-/** @name (EXPERIMENTAL_POST_OPS)
- * @{
- */
-/** Formmated output of the @ref experimental::PostOpType type
- *
- * @param[out] os Output stream.
- * @param[in] post_op_type Type to output.
- *
- * @return Modified output stream.
- */
-inline ::std::ostream &operator<<(::std::ostream &os, experimental::PostOpType post_op_type)
-{
- os << "type=";
- switch(post_op_type)
- {
- case experimental::PostOpType::Activation:
- {
- os << "Activation";
- break;
- }
- case experimental::PostOpType::Eltwise_Add:
- {
- os << "Eltwise_Add";
- break;
- }
- case experimental::PostOpType::Eltwise_PRelu:
- {
- os << "Eltwise_PRelu";
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- break;
- }
- }
- return os;
-}
-/** Converts a @ref experimental::PostOpType to string
- *
- * @param[in] post_op_type PostOpType value to be converted
- *
- * @return String representing the corresponding PostOpType
- */
-inline std::string to_string(experimental::PostOpType post_op_type)
-{
- std::stringstream str;
- str << post_op_type;
- return str.str();
-}
-/** Formatted output of the @ref experimental::IPostOp type.
- *
- * @param[out] os Output stream.
- * @param[in] post_op Type to output.
- *
- * @return Modified output stream.
- */
-template <typename T>
-inline ::std::ostream &operator<<(::std::ostream &os, const experimental::IPostOp<T> &post_op)
-{
- os << "<";
- os << post_op.type() << ",";
- os << "prev_dst_pos=" << post_op.prev_dst_pos() << ",";
- switch(post_op.type())
- {
- case experimental::PostOpType::Activation:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<T> *>(&post_op);
- os << "act_info=" << &(_post_op->_act_info);
- break;
- }
- case experimental::PostOpType::Eltwise_Add:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<T> *>(&post_op);
- os << "convert_policy=" << _post_op->_policy;
- break;
- }
- case experimental::PostOpType::Eltwise_PRelu:
- {
- const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwisePRelu<T> *>(&post_op);
- os << "convert_policy=" << _post_op->_policy;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported PostOpType");
- break;
- }
- }
- os << ">";
- return os;
-}
-/** Converts an @ref experimental::IPostOp to string
- *
- * @param[in] post_op IPostOp value to be converted
- *
- * @return String representing the corresponding IPostOp
- */
-template <typename T>
-inline std::string to_string(const experimental::IPostOp<T> &post_op)
-{
- std::stringstream str;
- str << post_op;
- return str.str();
-}
-/** Formatted output of the @ref experimental::PostOpList type.
- *
- * @param[out] os Output stream.
- * @param[in] post_ops Type to output.
- *
- * @return Modified output stream.
- */
-template <typename T>
-inline ::std::ostream &operator<<(::std::ostream &os, const experimental::PostOpList<T> &post_ops)
-{
- os << "[";
- for(const auto &post_op : post_ops.get_list())
- {
- os << *post_op << ",";
- }
- os << "]";
- return os;
-}
-/** Converts a @ref experimental::PostOpList to string
- *
- * @param[in] post_ops PostOpList value to be converted
- *
- * @return String representing the corresponding PostOpList
- */
-template <typename T>
-inline std::string to_string(const experimental::PostOpList<T> &post_ops)
-{
- std::stringstream str;
- str << post_ops;
- return str.str();
-}
-/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
-
/** Formatted output of the Dimensions type.
*
* @param[out] os Output stream.
@@ -399,7 +260,6 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMKernelInfo &gemm
os << " mult_interleave4x4_height=" << gemm_info.mult_interleave4x4_height;
os << " a_offset=" << gemm_info.a_offset;
os << " b_offset=" << gemm_info.b_offset;
- os << "post_ops=" << gemm_info.post_ops;
os << ")";
return os;
}
@@ -1563,7 +1423,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMInfo &info)
os << "fp_mixed_precision=" << info.fp_mixed_precision() << ",";
os << "broadcast_bias=" << info.broadcast_bias() << ",";
os << "pretranspose_B=" << info.pretranspose_B() << ",";
- os << "post_ops=" << info.post_ops() << "}";
+ os << "}";
return os;
}
@@ -2883,7 +2743,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Conv2dInfo &conv_inf
<< "act_info=" << to_string(conv_info.act_info) << ", "
<< "enable_fast_math=" << conv_info.enable_fast_math << ", "
<< "num_groups=" << conv_info.num_groups << ","
- << "post_ops=" << conv_info.post_ops << "}";
+ << "}";
return os;
}
@@ -3772,4 +3632,4 @@ inline std::string to_string(const arm_compute::CpuMatMulSettings &settings)
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_TYPE_PRINTER_H__ */
+#endif // ACL_UTILS_TYPEPRINTER_H