From 1af5416917268692fcd4b34b1d7ffebd3a2aea8a Mon Sep 17 00:00:00 2001 From: SiCongLi Date: Wed, 6 Oct 2021 15:25:57 +0100 Subject: Add experimental PostOp interface to ClGemmMatrixMultiplyReshapedKernel Part 1 This interface supports the fusion of multiple elementwise operations Partially resolves: COMPMID-4435 Change-Id: If68dd7dd98dcf239fde7cb1f0a4a6d4d1e899a6f Signed-off-by: SiCongLi Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6483 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins --- Android.bp | 4 + SConscript | 6 +- arm_compute/core/KernelDescriptors.h | 69 +- arm_compute/core/Types.h | 6 +- arm_compute/core/experimental/IPostOp.h | 162 +++ arm_compute/core/experimental/Types.h | 5 + src/core/CL/CLUtils.cpp | 97 +- src/core/CL/CLUtils.h | 86 +- .../fp_post_ops_act_eltwise_op_act.h | 101 ++ .../act_eltwise_op_act/gemm_mm_reshaped.cl | 1404 ++++++++++++++++++++ .../fp_elementwise_op_helpers.h | 262 ++++ .../fp_mixed_precision_helpers.h | 113 ++ src/core/CL/cl_kernels/common/gemm.cl | 6 +- src/core/experimental/PostOp.h | 171 +++ src/gpu/cl/ClKernelLibrary.cpp | 8 + .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 42 +- .../kernels/ClGemmMatrixMultiplyReshapedKernel.h | 1 + .../CL/functions/CLBatchNormalizationLayer.cpp | 3 +- tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp | 354 ++++- tests/validation/fixtures/GEMMFixture.h | 261 ++++ tests/validation/reference/PostOps.cpp | 76 ++ tests/validation/reference/PostOps.h | 47 + utils/TypePrinter.h | 163 ++- 23 files changed, 3391 insertions(+), 56 deletions(-) create mode 100644 arm_compute/core/experimental/IPostOp.h create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl create mode 100644 
src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h create mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h create mode 100644 src/core/experimental/PostOp.h create mode 100644 tests/validation/reference/PostOps.cpp create mode 100644 tests/validation/reference/PostOps.h diff --git a/Android.bp b/Android.bp index adcafa65d5..4ec0475605 100644 --- a/Android.bp +++ b/Android.bp @@ -27,6 +27,10 @@ opencl_srcs = [ "src/core/CL/cl_kernels/common/elementwise_operation.cl", "src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl", "src/core/CL/cl_kernels/common/elementwise_unary.cl", + "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", + "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl", + "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h", + "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "src/core/CL/cl_kernels/common/fft.cl", "src/core/CL/cl_kernels/common/fft_digit_reverse.cl", "src/core/CL/cl_kernels/common/fft_scale.cl", diff --git a/SConscript b/SConscript index 6c5839122a..468d7388cd 100644 --- a/SConscript +++ b/SConscript @@ -277,7 +277,10 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/repeat.h', 'src/core/CL/cl_kernels/tile_helpers.h', 'src/core/CL/cl_kernels/types.h', - 'src/core/CL/cl_kernels/warp_helpers.h' + 'src/core/CL/cl_kernels/warp_helpers.h', + 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h', + 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h', + 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h', ] # Common kernels @@ -307,6 +310,7 @@ if env['opencl'] and 
env['embed_kernels']: 'src/core/CL/cl_kernels/common/floor.cl', 'src/core/CL/cl_kernels/common/gather.cl', 'src/core/CL/cl_kernels/common/gemm.cl', + 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl', 'src/core/CL/cl_kernels/common/gemv.cl', 'src/core/CL/cl_kernels/common/gemmlowp.cl', 'src/core/CL/cl_kernels/common/generate_proposals.cl', diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index a6e5c3372e..bc6ec1f6c5 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/IPostOp.h" namespace arm_compute { @@ -52,48 +53,52 @@ struct FFTRadixStageKernelInfo bool is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */ }; +class ITensorInfo; /** Descriptor used by the GEMM kernels */ struct GEMMKernelInfo { GEMMKernelInfo() = default; GEMMKernelInfo( - unsigned int im, - unsigned int in, - unsigned int ik, - unsigned int idepth_output_gemm3d, - bool ireinterpret_input_as_3d, - bool ibroadcast_bias, - bool ifp_mixed_precision, - bool ihas_pad_y, - ActivationLayerInfo iactivation_info, - int inmult_transpose1xW_width, - int imult_interleave4x4_height, - GEMMLHSMatrixInfo ilhs_info, - GEMMRHSMatrixInfo irhs_info, - int32_t ina_offset, - int32_t inb_offset) + unsigned int im, + unsigned int in, + unsigned int ik, + unsigned int idepth_output_gemm3d, + bool ireinterpret_input_as_3d, + bool ibroadcast_bias, + bool ifp_mixed_precision, + bool ihas_pad_y, + ActivationLayerInfo iactivation_info, + int inmult_transpose1xW_width, + int imult_interleave4x4_height, + GEMMLHSMatrixInfo ilhs_info, + GEMMRHSMatrixInfo irhs_info, + int32_t ina_offset, + int32_t inb_offset, + const experimental::PostOpList &ipost_ops = experimental::PostOpList {}) : m(im), n(in), k(ik), 
depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision), has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), - rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset) + rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops) { } - unsigned int m{ 0 }; /**< Number of LHS rows*/ - unsigned int n{ 0 }; /**< Number of RHS columns*/ - unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */ - unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */ - bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ - bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ - bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). 
*/ - bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ - ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ - int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ - int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ - GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */ - GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */ - int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */ - int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */ - GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */ + unsigned int m{ 0 }; /**< Number of LHS rows*/ + unsigned int n{ 0 }; /**< Number of RHS columns*/ + unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */ + unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */ + bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ + bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ + bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). 
*/ + bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ + ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ + int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ + int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ + GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */ + GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */ + int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */ + int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */ + GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */ + experimental::PostOpList post_ops{}; /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note unsupported post ops would not be executed. + * If specified, automatically disable the @ref activation_info */ }; /** Compute descriptor used by the depthwise convolution native kernel */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index b2b09825c1..bfe85ea937 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -382,7 +382,11 @@ struct BorderSize /** Container for 2D padding size */ using PaddingSize = BorderSize; -/** Policy to handle overflow */ +/** Policy to handle integer overflow + * @note: This is ignored by floating point operations where the overflow behavior adheres to the IEEE-754 standard + * which states that in case of overflow ±infinity is returned for the round-to-nearest modes (and follows the + * rounding rules for the directed rounding modes) by default. 
+ */ enum class ConvertPolicy { WRAP, /**< Wrap around */ diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h new file mode 100644 index 0000000000..cd6b8fc4cc --- /dev/null +++ b/arm_compute/core/experimental/IPostOp.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP +#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP + +#include +#include +#include + +namespace arm_compute +{ +namespace experimental +{ +/** Type of Post Op */ +enum class PostOpType +{ + Activation, + Eltwise_Add, +}; +/** An ordered sequence of type of Post Ops */ +using PostOpTypeSequence = std::vector; +/** An elementwise n-ary operation that can be appended to and fused with (at kernel-level) other operators + * It contains: + * 1. The attributes of the original operator. + * 2. 
Any additional tensor argument. + * 3. The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos ) + * + * For example, a series of chained ops: + * + * div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info) + * + * translates to + * + * dst = conv(src0, weights, bias, conv_info) // main op + * dst = relu(dst, act_info) // previous dst is placed in the first (and only) argument + * dst = div(src1, dst, div_info) // previous dst is placed in the second argument + * + * which in turn translates to: + * + * main op: conv(src0, weights, bias, conv_info) + * post op1: relu(act_info, prev_dst_pos = 0) + * post op2: div(div_info, src1, prev_dst_pos = 1) + * + * NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type + * NOTE: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime + * and the lifetime of its copies. This is almost guaranteed as IPostOp is only meant to be used at configure time + * after the ITensor or ITensorInfo objects are already constructed + */ +template +struct IPostOp +{ + /** Get the arity of the post op + * NOTE: this is one fewer than the arity of the original op, because we implicitly pass the previous op's dst + * tensor as one of the arguments + */ + size_t arity() const + { + return arguments().size(); + } + /** The position of previous op's dst in current op's argument list */ + virtual int prev_dst_pos() const = 0; + /** The IPostOp type */ + virtual PostOpType type() const = 0; + /** The argument tensors + * The order of the argument tensor is strictly preserved + */ + virtual std::vector arguments() = 0; + virtual std::vector arguments() const = 0; + /** Clone method used in cases where PostOps are owned by unique_ptr + * NOTE: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource + */ + virtual std::unique_ptr> clone() const = 0; + virtual ~IPostOp() + { + } +}; + +/** 
A sequence of PostOps that can be appended to the end of other operators */ +template +class PostOpList +{ +public: + /** Constructor */ + PostOpList() = default; + /** Destructor */ + ~PostOpList() = default; + PostOpList(const PostOpList &other) + { + for(const auto &op : other._post_ops) + { + this->_post_ops.push_back(op->clone()); + } + } + PostOpList &operator=(const PostOpList &other) + { + PostOpList tmp{ other }; + std::swap(tmp, *this); + return *this; + } + PostOpList(PostOpList &&other) = default; + PostOpList &operator=(PostOpList &&other) = default; + + /** Add a new post op at the end of the list */ + template + void push_back_op(Args &&... args) + { + _post_ops.push_back(std::make_unique(std::forward(args)...)); + } + + /** Number of post ops */ + size_t size() const + { + return _post_ops.size(); + } + + /** Total number of arguments across all post ops (sum of each post op's arity) */ + size_t total_num_arguments() const + { + return std::accumulate(_post_ops.begin(), _post_ops.end(), 0, [](size_t op1_arity, const auto & op2) + { + return op1_arity + op2->arity(); + }); + } + + /** Get the underlying post op list */ + std::vector>> &get_list() + { + return _post_ops; + } + const std::vector>> &get_list() const + { + return _post_ops; + } + +private: + std::vector>> _post_ops{}; +}; + +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP \ No newline at end of file diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index a478513b1b..c8755dc26c 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -76,6 +76,11 @@ enum TensorType : int32_t ACL_VEC_COL_SUM = ACL_SRC_4, ACL_SHIFTS = ACL_SRC_5, ACL_MULTIPLIERS = ACL_SRC_6, + + // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else + EXPERIMENTAL_ACL_POST_OP_ARG = 2048, + EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG, + EXPERIMENTAL_ACL_POST_OP_ARG_LAST = 
EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments }; namespace experimental diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 67af240044..1da970e705 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "support/StringSupport.h" #include "src/core/CL/CLUtils.h" +#include "src/core/experimental/PostOp.h" -cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch) +namespace arm_compute +{ +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch) { cl_channel_type cl_data_type; @@ -62,3 +68,90 @@ cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, cons return cl::Image2D(cl_image); } + +namespace experimental +{ +PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config) + : _supported_config(supported_config) +{ + ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed"); + for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it) + { + auto post_op_sequence = it->first; + auto post_op_slots = std::get<1>(it->second); + ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots"); + } +} + +bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const 
experimental::PostOpList &post_ops) +{ + // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting + for(const auto &op : post_ops.get_list()) + { + for(const auto &tensor : op->arguments()) + { + const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape()); + if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0)) + { + return false; + } + } + } + return true; +} + +bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList &post_ops) const +{ + if(post_ops.size() == 0) + { + return true; // Always support cases where no post op is specified + } + const auto post_op_sequence = get_post_op_sequence(post_ops); + + return _supported_config.find(post_op_sequence) != _supported_config.end(); +} + +void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList &post_ops) const +{ + const auto post_op_sequence = get_post_op_sequence(post_ops); + const auto slots = std::get<1>(_supported_config.at(post_op_sequence)); + for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id) + { + const auto &post_op = post_ops.get_list().at(post_op_id); + const auto slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]); + if(post_op->type() == experimental::PostOpType::Activation) + { + const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); + const auto act_type = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation())); + const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a()); + const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b()); + build_opts.add_option(act_type); + build_opts.add_option(act_a_val); + build_opts.add_option(act_b_val); + } + else if(post_op->type() == 
experimental::PostOpType::Eltwise_Add) + { + size_t arg_id = 1; + const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos()); + build_opts.add_option(eltwise_op); + for(const auto &tensor : post_op->arguments()) + { + const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1)); + const auto width = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0)); + build_opts.add_option(height); + build_opts.add_option(width); + ++arg_id; + } + } + } +} + +void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList &post_ops) const +{ + const auto post_op_sequence = get_post_op_sequence(post_ops); + const auto postfix = std::get<0>(_supported_config.at(post_op_sequence)); + kernel_name += postfix; +} +} // namespace experimental + +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index b65d547756..d133e4fe6f 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,10 +26,13 @@ #define ARM_COMPUTE_CL_CLUTILS_H #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/experimental/IPostOp.h" namespace arm_compute { class TensorShape; +class CLBuildOptions; +class ITensorInfo; /** Create a cl::Image2D object from an OpenCL buffer * @@ -51,6 +54,87 @@ class TensorShape; */ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch); +namespace experimental +{ +/** @name (EXPERIMENTAL_POST_OPS) + * @{ + */ + +/** Manage validation, building and configurations of PostOp CL kernels */ +class PostOpCLKernelUtils final +{ +public: + /** CL kernel name postfix for post ops */ + using NamePostfix = std::string; + /** CL kernels that support post ops assign each post op to a 'slot', in accordance with the postfix + * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots + * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition + * + * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences. + * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel. + * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the + * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }). + * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence. + * We can either: + * 1. assign the elementwise to slot 1 and prelu to slot 2 of kernel 1 + * { { Eltwise_Add, PRelu } -> {"_eltwiseadd_prelu", {1, 2} } } or + * 2. 
assign the elementwise to slot 2 and prelu to slot 4 of kernel 2 + * { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } } + */ + using Slots = std::vector; + using Config = std::map>; + +public: + explicit PostOpCLKernelUtils(const Config &config); + + /** Check if post op argument tensor shapes are compliant + * All post ops must not alter the shape of the original dst tensor (even after broadcasting) + * + * @param[in] dst Dst tensor to apply the post ops to + * @param[in] post_ops Post ops + * + * @return true if shapes are compliant and false otherwise + */ + static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList &post_ops); + /** Check if the post op sequence is supported in the current configuration + * + * @param[in] post_ops Post ops + * + * @return true if the post op sequence is supported and false otherwise + */ + bool is_post_op_sequence_supported(const PostOpList &post_ops) const; + /** Helper function to set PostOp related build options + * @note Convention + * 1. Each post op "slot" is prefixed with "P", followed by the usual parameters for that post op. + * E.g. If the first slot is an activation, we need to pass 3 definitions in this way: + * -DP1_ACTIVATION_TYPE=... -DP1_ACTIVATION_A_VAL=... -DP1_ACTIVATION_B_VAL=... + * + * 2. For multi-ary post ops, to pass the position of the previous op's dest tensor, + * we append "_X_POS_" to the post op type. + * E.g. for a single post op add(dst, x), where dst is the result of the main op. 
+ * In this case, the position of the previous op's dest is 0, so we pass + * -P1_ELTWISE_OP=ADD_X_POS_0 + * + * @param[out] built_opts OpenCL kernel build options + * @param[in] post_ops Post ops + * + */ + void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList &post_ops) const; + /** Helper function to set PostOp kernel name + * + * @param[out] kernel_name OpenCL kernel name + * @param[in] post_ops Post ops + * + */ + void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList &post_ops) const; + +private: + Config _supported_config{}; +}; +/** @} */ // end of group (EXPERIMENTAL_POST_OPS) + +} // namespace experimental + } // arm_compute #endif /* ARM_COMPUTE_CL_CLUTILS_H */ diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h new file mode 100644 index 0000000000..fc9704f13b --- /dev/null +++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "fp_mixed_precision_helpers.h" + +/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence: + * act (optional): POST_OP1_ACTIVATION_OPTIONAL + * eltwise_op : POST_OP2_ELTWISE_OP + * act (optional): POST_OP3_ACTIVATION_OPTIONAL + */ + +/** Post Op 1: Activation Block (Optional) + * @name POST_OP1_ACTIVATION_OPTIONAL + * Toggled by -DP1_ACTIVATION_TYPE + * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK + * @{ + */ +#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL) +#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \ + MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR); +#else // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL) +#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop +#endif // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL) +/** @} */ // end of group POST_OP1_ACTIVATION_OPTIONAL + +/** Post Op 2: Eltwise Op Block + * Handles both broadcasting and non-broadcasting cases + * @name POST_OP2_ELTWISE_OP + * + * @param[in] P2_ELTWISE_ARG1_HEIGHT Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor + * @param[in] P2_ELTWISE_ARG1_WIDTH Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor 
+ * @param[in] OP The elementwise post op + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] BASENAME The basename of the result variables + * @param[in] ELTWISE_OPERAND_NAME The basename of the other operand variables + * @param[in] DATA_TYPE Data type of the result variables + * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op + * @param[in] ZERO Zero vector for z offset + * @param[in] PARTIAL_LOAD_M0 The partial size in y, for partial blocks. Supported: [0, @p M0) + * @param[in] PARTIAL_LOAD_N0 The partial size in x, for partial blocks. Supported: [0, @p N0) + * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0. + * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0. + * @{ + */ +#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) +#if P2_ELTWISE_ARG1_HEIGHT == 1 +#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1] +#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \ + VEC_DATA_TYPE(DATA_TYPE, 1) \ + ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr); \ + MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp); +#else // P2_ELTWISE_ARG1_WIDTH == 1; Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0] +#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE, 
DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \ + LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X); \ + MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp); +#endif // P2_ELTWISE_ARG1_WIDTH == 1 +#else // P2_ELTWISE_ARG1_HEIGHT == 1; Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0] +#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \ + LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \ + MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp); +#endif // P2_ELTWISE_ARG1_HEIGHT == 1 +#endif // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) +/** @} */ // end of group POST_OP2_ELTWISE_OP + +/** Post Op 3: Activation Block (Optional) + * @name POST_OP3_ACTIVATION_OPTIONAL + * Toggled by -DP3_ACTIVATION_TYPE + * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK + * @{ + */ +#if 
defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL) +#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \ + MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR); +#else // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL) +#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop +#endif // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL) +/** @} */ // end of group POST_OP3_ACTIVATION_OPTIONAL diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl new file mode 100644 index 0000000000..9404c5e6db --- /dev/null +++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl @@ -0,0 +1,1404 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "fp_post_ops_act_eltwise_op_act.h" +#include "gemm_helpers.h" +#include "repeat.h" + +/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */ + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) +#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) + +/** ARM_DOT_K0(a, b, c): accumulates into c the products of components s0..s(K0-1) of a and b. + * With MIXED_PRECISION defined each product is added with a separate multiply and add (c += a.sX * b.sX); + * otherwise fma(a.sX, b.sX, c) is used. Supported K0: 2, 3, 4, 8, 16; any other value hits the #error. + */ +#if defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + c += a.s8 * b.s8; \ + c += a.s9 * b.s9; \ + c += a.sA * b.sA; \ + c += a.sB * b.sB; \ + c += a.sC * b.sC; \ + c += a.sD * b.sD; \ + c += a.sE * b.sE; \ + c += a.sF * b.sF; \ +
}) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#else // defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + c = fma(a.s8, b.s8, c); \ + c = fma(a.s9, b.s9, c); \ + c = fma(a.sA, b.sA, c); \ + c = fma(a.sB, b.sB, c); \ + c = fma(a.sC, b.sC, c); \ + c = fma(a.sD, b.sD, c); \ + c = fma(a.sE, b.sE, c); \ + c = fma(a.sF, b.sF, c); \ + }) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#endif // defined(MIXED_PRECISION) + +/** ARM_DOT_K0XN0(a, b, c): runs ARM_DOT_K0 between vector a and each of the N0 vectors b0..b(N0-1), + * accumulating the i-th result into component c.s(i). Any previous definition is discarded first. + */ +#if defined(ARM_DOT_K0XN0) +#undef ARM_DOT_K0XN0 +#endif // defined(ARM_DOT_K0XN0) + +#if N0 == 2 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0),
(c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + ARM_DOT_K0((a), (b##8), (c.s8)); \ + ARM_DOT_K0((a), (b##9), (c.s9)); \ + ARM_DOT_K0((a), (b##A), (c.sA)); \ + ARM_DOT_K0((a), (b##B), (c.sB)); \ + ARM_DOT_K0((a), (b##C), (c.sC)); \ + ARM_DOT_K0((a), (b##D), (c.sD)); \ + ARM_DOT_K0((a), (b##E), (c.sE)); \ + ARM_DOT_K0((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: + * Post op 1: activation (optional) + * Post op 2: elementwise op + * Post op 3: activation (optional) + * + * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 + * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform + * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 + * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 + * @note (Optional)
-DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 + * + * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions: + * + * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 + * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) + * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) + * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + // Post-Op arguments + IMAGE_DECLARATION(eltwise_operand), + uint k, + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z, + uint eltwise_operand_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(LHS_INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define
RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += get_global_id(2) * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,...
zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + for(int i = 0; i < k; i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + + const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( + 2) * bias_stride_z; + + LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias + MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + // c = act(c) + POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); + // apply P2_ELTWISE_OP to c and eltwise_operand, result in c (mix-precision, broadcast, boundary aware) + POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x); + // c = act(c) + POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); + + // Store output block + MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP +} + +#if defined(OPENCL_IMAGE_SUPPORT) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. + * Post op 1: activation (optional) + * Post op 2: elementwise op + * Post op 3: activation (optional) + * + * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 + * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform + * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 + * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 + * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 + * + * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions: + * + * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix.
Supported data type: F16/F32 + * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) + * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) + * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + // Post-Op arguments + IMAGE_DECLARATION(eltwise_operand), + uint k, + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z, + uint eltwise_operand_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) + + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(LHS_INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X (PIXEL_UNIT * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X PIXEL_UNIT +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) +
if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) + const uint z_rhs = get_global_id(2); +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; + const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,...
zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + // NOTE(review): the loop bound below is the compile-time macro K, whereas the buffer-based kernel uses the runtime argument k; the enclosing guard does not check defined(K) - confirm -DK is always passed for this kernel + for(int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix stored in a cl_image + REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); + LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + + x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + + const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( + 2) * bias_stride_z; + + LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias + MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + // c = act(c) + POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); + // apply P2_ELTWISE_OP to c and eltwise_operand, result in c (mix-precision, broadcast, boundary aware) + POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x); + // c = act(c) + POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); + + // Store output block + MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp); + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +#undef PIXEL_UNIT +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP +} +#endif // defined(OPENCL_IMAGE_SUPPORT) + +#if defined(LHS_TRANSPOSE) + +#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) + +#if defined(MIXED_PRECISION) + +#if(GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#else // defined(MIXED_PRECISION) + +#if(GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (a) * (b); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#endif // defined(MIXED_PRECISION) + +#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \ + }) +#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ + }) +#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ + }) +#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ + }) +#define
ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ + }) + +// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1 +// a is the column-vector (transposed) +// b is the row-vector (not transposed) +// C is the output matrix +// Lower case is a vector (a, b) +// Upper case is a matrix (C) +#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) + +#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \ + }) +#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ + }) +#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ + }) +#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ + }) +#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ + }) +// K0 = 16: vectors 0-7 go through the K0 = 8 expansion; vectors 8-F are issued directly with the vector-by-vector macro, because ARM_MM_T_NT_M0xN0x1 would paste a 0 suffix onto the already parenthesised operand +#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ + ARM_VVM_T_NT_M0xN0x1(M0,
N0, TYPE, (A##B), (B##B), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ + }) + +// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. +// The dimensions for this matrix multiplications are defined through M0, N0 and K0 +// The dimensions supported are: +// M0: 1, 2, 3, 4, 8 +// N0: 1, 2, 3, 4, 8, 16 +// K0: 1, 2, 3, 4, 8, 16 +// This macro calls the vector-by-matrix macro K0 times +// A, B and C are matrices +#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ + CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ + (M0, N0, TYPE, A, B, C) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: + * Post op 1: activation (optional) + * Post op 2: elementwise op + * Post op 3: activation (optional) + * + * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1 + * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform + * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 + * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 + * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 + * + * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions: + * + * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix.
Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+                                                                    IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                                    IMAGE_DECLARATION(dst),
+                                                                    // Post-Op arguments
+                                                                    IMAGE_DECLARATION(eltwise_operand),
+                                                                    uint k,
+                                                                    uint lhs_stride_z,
+                                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                                    uint bias_stride_z,
+#endif // defined(BETA)
+                                                                    uint dst_stride_z,
+                                                                    uint eltwise_operand_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                                    ,
+                                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                                   )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+    // LHS offset and step X
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+    // True for the work-items whose M0 x N0 block reaches (or crosses) the M / N boundary
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
+    // Walk the accumulation dimension K0 values at a time. The "K0 > n" guards unroll the
+    // per-value steps at compile time, so K0 = 1, 2, 3, 4, 8 or 16 yields exactly K0 steps.
+    for(int i = 0; i < k; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+        // Without interleaving, skip over the blocks that belong to the other V0 / H0 work-items
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+        rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    // c = act(c)
+    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+    // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
+    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // c = act(c)
+    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+    // Store output block
+    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
+
+    // Undefine every helper macro defined at the top of this kernel so none leaks past it
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix.
Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+                                                                            __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                                            IMAGE_DECLARATION(dst),
+                                                                            // Post-Op arguments
+                                                                            IMAGE_DECLARATION(eltwise_operand),
+                                                                            uint k,
+                                                                            uint lhs_stride_z,
+                                                                            uint rhs_stride_z,
+#if defined(BETA)
+                                                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                                                            uint dst_stride_z,
+                                                                            uint eltwise_operand_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                                            ,
+                                                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                                           )
+{
+    // NOTE(review): rhs_stride_z is not read in this kernel body; the RHS batch offset is
+    // folded into the image row coordinate (y_rhs) instead.
+
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+    // LHS offset and step X
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size (expressed in image pixels rather than elements)
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates (x in pixels, y in image rows; batches are stacked along y by RHS_HEIGHT)
+    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+
+    // NOTE(review): the loop bound is the identifier K, not the kernel argument k used by the
+    // buffer-based kernel above; presumably K is a compile-time definition (-DK) - confirm it is always set.
+    // The "K0 > n" guards unroll the per-value steps, so K0 = 1, 2, 3, 4, 8 or 16 yields exactly K0 steps.
+    for(int i = 0; i < K; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+        // The image reads above use offsets from x_rhs, so advance it once per iteration here
+        x_rhs += K0 * RHS_STEP_X;
+#ifndef RHS_INTERLEAVE
+        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    // True for the work-items whose M0 x N0 block reaches (or crosses) the M / N boundary
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    // c = act(c)
+    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+    // c = c + eltwise_operand (mix-precision, broadcast, boundary aware)
+    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // c = act(c)
+    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+    // Store output block
+    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
+
+    // Clean up the kernel-local helper macros (RHS_STEP_LOOP is never defined here; undefining it is a no-op)
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
+#endif // defined(LHS_TRANSPOSE)
+#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
new file mode 100644
index 0000000000..9ddf51a13c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */
+
+/** List of (binary) elementwise operators, accounting for the argument position of argument X. Every operator is fully parenthesised so it expands safely with compound arguments and inside larger expressions.
+ * @note X_POS denotes the position of argument X. e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place
+ * @name elementwise_post_ops
+ * @{
+ */
+#if defined(N0) && !defined(VEC_SIZE)
+#define VEC_SIZE N0
+#endif // defined(N0) && !defined(VEC_SIZE)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
+#define ADD_X_POS_0(x, y) ((x) + (y))
+#define SUB_X_POS_0(x, y) ((x) - (y))
+#define MAX_X_POS_0(x, y) max(x, y)
+#define MIN_X_POS_0(x, y) min(x, y)
+#define SQUARED_DIFF_X_POS_0(x, y) (((x) - (y)) * ((x) - (y)))
+#define POWER_X_POS_0(x, y) pow(x, y)
+#if VEC_SIZE == 1
+#define PRELU_X_POS_0(x, y) ((x) > 0 ? (x) : (x) * (y))
+#else // VEC_SIZE == 1
+#define PRELU_X_POS_0(x, y) (select((y) * (x), (x), CONVERT(((x) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
+#endif // VEC_SIZE == 1
+#define DIV_X_POS_0(x, y) ((x) / (y))
+#define AND_X_POS_0(x, y) (CONVERT(((x) && (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1)) // NOTE(review): AND/OR apply bitwise & to a DATA_TYPE vector; presumably invalid for floating-point DATA_TYPE - confirm
+#define OR_X_POS_0(x, y) (CONVERT(((x) || (y)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
+
+#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
+#define SUB_X_POS_1(x, y) ((y) - (x))
+#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
+#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
+#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
+#define POWER_X_POS_1(x, y) pow(y, x)
+#if VEC_SIZE == 1
+#define PRELU_X_POS_1(x, y) ((y) > 0 ? (y) : (y) * (x))
+#else // VEC_SIZE == 1
+#define PRELU_X_POS_1(x, y) (select((x) * (y), (y), CONVERT(((y) > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
+#endif // VEC_SIZE == 1
+#define DIV_X_POS_1(x, y) ((y) / (x))
+#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
+#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)
+
+// By default use the order of the arguments as they are passed in, i.e. _X_POS_0
+#define ADD(x, y) ADD_X_POS_0(x, y)
+#define SUB(x, y) SUB_X_POS_0(x, y)
+#define MAX(x, y) MAX_X_POS_0(x, y)
+#define MIN(x, y) MIN_X_POS_0(x, y)
+#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
+#define POWER(x, y) POWER_X_POS_0(x, y)
+#define PRELU(x, y) PRELU_X_POS_0(x, y)
+#define DIV(x, y) DIV_X_POS_0(x, y)
+#define AND(x, y) AND_X_POS_0(x, y)
+#define OR(x, y) OR_X_POS_0(x, y)
+
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
+/** @} */ // end of group elementwise_post_ops
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name ELTWISE_OP_ROW_n
+ *
+ * @param[in]      OP       The elementwise post op
+ * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in]      OPERAND2 The basename of the operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
+    OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);
+
+#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);
+
+#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);
+
+#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);
+
+#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);
+
+#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);
+
+#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);
+
+#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);
+
+#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);
+
+#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2)      \
+    OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);
+
+#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);
+
+#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);
+
+#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);
+
+#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);
+
+#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);
+
+#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);
+
+/** @} */ // end of group ELTWISE_OP_ROW_n
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name ELTWISE_OP_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] OP       The elementwise post op
+ * @param[in] N        The number of vectors in the block
+ * @param[in] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in] OPERAND2 The basename of the operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
+#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)
+/** @} */ // end of group ELTWISE_OP_BLOCK
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
+ * @name ELTWISE_OP_ROW_BROADCAST_n
+ *
+ * @param[in]      OP       The elementwise post op
+ * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in]      OPERAND2 The basename of the broadcast operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
+    OPERAND1##0 = OP(OPERAND1##0, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##1 = OP(OPERAND1##1, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##2 = OP(OPERAND1##2, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##3 = OP(OPERAND1##3, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##4 = OP(OPERAND1##4, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##5 = OP(OPERAND1##5, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##6 = OP(OPERAND1##6, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##7 = OP(OPERAND1##7, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##8 = OP(OPERAND1##8, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2)      \
+    OPERAND1##9 = OP(OPERAND1##9, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##A = OP(OPERAND1##A, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##B = OP(OPERAND1##B, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##C = OP(OPERAND1##C, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##D = OP(OPERAND1##D, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##E = OP(OPERAND1##E, OPERAND2);
+
+#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \
+    ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2)     \
+    OPERAND1##F = OP(OPERAND1##F, OPERAND2);
+
+/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n
+
+/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
+ * @name ELTWISE_OP_BLOCK_BROADCAST
+ * @note Only support:
+ *      case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX];
+ *      case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar);
+ *      Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1];
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] OP       The elementwise post op
+ * @param[in] N        The number of vectors in the block
+ * @param[in] OPERAND1 The basename of the destination and operand 1 variables
+ * @param[in] OPERAND2 The basename of the operand 2 variables
+ * @{
+ */
+#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2)
+#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2)
+/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
new file mode 100644
index 0000000000..e8df121142
--- /dev/null
+++ 
b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "fp_elementwise_op_helpers.h"
+#include "gemm_helpers.h"
+#include "load_store_utility.h"
+
+/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations
+ * -DMIXED_PRECISION toggles mixed precision mode. Each macro below expands to one or more complete statements (trailing ';' included).
+ */
+
+/** Mixed-Precision-Aware Activation Block
+ * @name MIXED_PRECISION_ACTIVATION_BLOCK
+ * params N ... B_VAL: same as those in @ref ACTIVATION_BLOCK
+ *
+ * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type, used in place of DATA_TYPE in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
+    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
+    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK
+
+/** Mixed-Precision-Aware Elementwise Op Block
+ * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
+ * @name MIXED_PRECISION_ELTWISE_OP_BLOCK
+ *
+ * @param[in] OP                    The elementwise post op
+ * @param[in] M0                    The number of consecutive rows
+ * @param[in] N0                    The number of consecutive columns
+ * @param[in] OPERAND1              The basename of the first and result operand variables
+ * @param[in] OPERAND2              The basename of the second operand variables
+ * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
+ * @param[in] CONVERTED_OPERAND2    The basename of the second operand variables converted to higher-precision in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                     \
+    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
+#endif // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK
+
+/** Mixed-Precision-Aware Elementwise Op Broadcast Block
+ * Performs OPERAND1 = OP(OPERAND1, OPERAND2), where only row 0 of OPERAND2 is used and applied to every row of OPERAND1
+ * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
+ * @note Only support:
+ *      case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1
+ *      case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1
+ *      Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1
+ *
+ * @param[in] OP                    The elementwise post op
+ * @param[in] M0                    The number of consecutive rows, > 1
+ * @param[in] N0                    The number of consecutive columns, >= 1
+ * @param[in] OPERAND1              The basename of the first and result operand variables
+ * @param[in] OPERAND2              The basename of the second operand variables
+ * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
+ * @param[in] CONVERTED_OPERAND2    The basename of the second operand variables converted to higher-precision in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                                \
+    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
+    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
+#endif // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
+
+/** Mixed-Precision-Aware Boundary-Aware Store Block
+ * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
+ * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE
+ *
+ * @param[in] BASENAME_LP The basename of the low precision variables, converted from BASENAME to DATA_TYPE before the store, in case of mixed-precision op
+ * @{
+ */
+#if defined(MIXED_PRECISION)
+#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP);                                                                                                                        \
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
+#else // defined(MIXED_PRECISION)
+#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
+#endif // defined(MIXED_PRECISION)
+/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index 431c97becc..dd03147ad6 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -2524,6 +2524,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
  * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) @@ -2798,6 +2799,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed + * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) @@ -3179,6 +3181,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed + * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. @@ -3560,6 +3563,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. 
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed + * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl * * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). @@ -3929,7 +3933,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), #endif // defined(LHS_TRANSPOSE) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) #if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) diff --git a/src/core/experimental/PostOp.h b/src/core/experimental/PostOp.h new file mode 100644 index 0000000000..64414d2050 --- /dev/null +++ b/src/core/experimental/PostOp.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOP +#define ARM_COMPUTE_EXPERIMENTAL_POSTOP + +#include "arm_compute/core/experimental/IPostOp.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/Types.h" +#include "support/Cast.h" + +#include + +/** (EXPERIMENTAL_POST_OPS) */ +namespace arm_compute +{ +namespace experimental +{ +template +struct PostOpAct : public IPostOp +{ +public: + PostOpAct(const ActivationLayerInfo &act_info) + : _act_info{ act_info } + { + } + // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy + ~PostOpAct() override = default; + PostOpAct(const PostOpAct &) = default; + PostOpAct &operator=(const PostOpAct &) = default; + PostOpAct(PostOpAct &&) = default; + PostOpAct &operator=(PostOpAct &&) = default; + + int prev_dst_pos() const override + { + return 0; + } + PostOpType type() const override + { + return PostOpType::Activation; + } + std::vector arguments() override + { + return {}; + } + std::vector arguments() const override + { + return {}; + } + std::unique_ptr> clone() const override + { + return std::make_unique>(*this); + } + ActivationLayerInfo _act_info; +}; + +template +struct PostOpEltwiseAdd : public IPostOp +{ +public: + PostOpEltwiseAdd(TensorRelatedT addend, int prev_op_arg_pos, ConvertPolicy policy) + : _addend{ addend }, + _prev_op_arg_pos{ prev_op_arg_pos }, + _policy{ policy } + { + } + // NOTE: PostOps do not own any resources pointed to by 
TensorRelatedT if it's a pointer type, thus allow shallow copy + ~PostOpEltwiseAdd() override = default; + PostOpEltwiseAdd(const PostOpEltwiseAdd &) = default; + PostOpEltwiseAdd &operator=(const PostOpEltwiseAdd &) = default; + PostOpEltwiseAdd(PostOpEltwiseAdd &&) = default; + PostOpEltwiseAdd &operator=(PostOpEltwiseAdd &&) = default; + int prev_dst_pos() const override + { + return _prev_op_arg_pos; + } + PostOpType type() const override + { + return PostOpType::Eltwise_Add; + } + std::vector arguments() override + { + return { &_addend }; + } + std::vector arguments() const override + { + return { &_addend }; + } + std::unique_ptr> clone() const override + { + return std::make_unique>(*this); + } + TensorRelatedT _addend; + int _prev_op_arg_pos; + ConvertPolicy _policy; +}; + +/** Transform a PostOpList of type FromTensorT to one of type ToTensorT */ +template +PostOpList transform_post_op_list_arguments(const PostOpList &post_ops, std::function transform_arg) +{ + PostOpList transformed_post_ops; + int op_idx = 0; + for(const auto &post_op : post_ops.get_list()) + { + switch(post_op->type()) + { + case PostOpType::Activation: + { + const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); + transformed_post_ops.template push_back_op>(_post_op->_act_info); + break; + } + case PostOpType::Eltwise_Add: + { + const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get()); + transformed_post_ops.template push_back_op>(transform_arg(_post_op->_addend), _post_op->_prev_op_arg_pos, _post_op->_policy); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported PostOpType"); + } + } + ++op_idx; + } + return transformed_post_ops; +} + +/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */ +inline TensorType get_post_op_arg_type(size_t index) +{ + ARM_COMPUTE_ERROR_ON_MSG(static_cast(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument 
index is out of range"); + return static_cast(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast(index)); +} + +template +PostOpTypeSequence get_post_op_sequence(const PostOpList &post_ops) +{ + PostOpTypeSequence post_op_sequence; + for(const auto &op : post_ops.get_list()) + { + post_op_sequence.push_back(op->type()); + } + return post_op_sequence; +} + +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOP \ No newline at end of file diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index 9d524f936e..cbc4caf5f6 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -276,6 +276,10 @@ const std::map ClKernelLibrary::_kernel_program_map = { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, + { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, @@ -579,6 +583,10 @@ const std::map ClKernelLibrary::_program_source_map = { "common/gemm.cl", #include "./cl_kernels/common/gemm.clembed" + }, + { + "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl", +#include 
"./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed" }, { "common/gemmlowp.cl", diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index 3a39128c0a..4b28e2badc 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -34,6 +34,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" +#include "src/core/experimental/PostOp.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -51,6 +52,16 @@ namespace { using ElementsProcessed = Steps; +const auto post_op_utils = experimental::PostOpCLKernelUtils( +{ + // PostOp sequence -> {Kernel Postfix, PostOp Slots} + { {}, { "", {} } }, + { { experimental::PostOpType::Activation }, { "", { 1 } } }, + { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } }, + { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } }, + { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } }, + { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } } +}); Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) @@ -74,6 +85,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; @@ -117,6 +129,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant"); } return Status{}; @@ -180,6 +193,7 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; _k = gemm_info.k; + _num_post_op_args = gemm_info.post_ops.total_num_arguments(); // Check if we need to slide the matrix B const unsigned int num_dimensions_src0 = src0->num_dimensions(); @@ -222,9 +236,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - 
build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION"); build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); @@ -240,11 +251,23 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + // If post_ops are used, then we disable the use of gemm_info.activation_info + if(gemm_info.post_ops.size() > 0) + { + post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops); + } + else + { + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + } std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; kernel_name += rhs_info.export_to_cl_image ? 
"_texture" : ""; + post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -360,6 +383,13 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // dst buffer add_2D_tensor_argument(idx, dst, slice); + // post op argument buffers + for(size_t i = 0; i < _num_post_op_args; ++i) + { + const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); + add_2D_tensor_argument(idx, post_op_arg, slice); + } + // K dimension (not used if _export_to_cl_image == true) _kernel.setArg(idx++, static_cast(_k)); @@ -378,6 +408,12 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // dst stride_z _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); + // post op argument stride_z + for(size_t i = 0; i < _num_post_op_args; ++i) + { + const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); + _kernel.setArg(idx++, static_cast(post_op_arg->info()->strides_in_bytes()[2])); + } // Cross-plan padding (if _reinterpret_output_as_3d = true) if(_reinterpret_output_as_3d) { diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index b320d318e9..09160ec0d1 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -106,6 +106,7 @@ private: bool _add_bias{ false }; bool _export_to_cl_image{ false }; unsigned int _k{ 1 }; + unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments }; } // namespace kernels } // namespace opencl diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index e8affc0853..234a0df2aa 100644 --- 
a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -29,11 +29,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { CLBatchNormalizationLayer::CLBatchNormalizationLayer() diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp index fd12dea4fe..b13c380470 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "src/core/experimental/PostOp.h" #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" #include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" @@ -61,11 +62,21 @@ using CLGEMMMatrixMultiplyReshaped = CLSynthetizeOperator using CLGEMMMatrixMultiplyReshapedFixture = GEMMMatrixMultiplyReshapedValidationFixture; +// Fixture for CLGEMMMatrixMultiplyReshaped with post ops +template +using CLGEMMMatrixMultiplyReshapedWithPostOpsFixture = + GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture; + // Fixture for CLGEMMMatrixMultiplyReshaped mixed precision template using CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture = GEMMMatrixMultiplyReshapedValidationFixture; +// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision with post ops +template +using CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture = + GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture; + // Fixture for CLGEMMMatrixMultiplyReshaped3D template using CLGEMMMatrixMultiplyReshaped3DFixture = 
GEMMMatrixMultiplyReshaped3DValidationFixture; @@ -172,6 +183,65 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { /** LHS transposed values */ const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } ); +/** Post Ops */ +using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedWithPostOpsFixture::PostOpArgBroadcast; +experimental::PostOpList empty_post_ops() +{ + return experimental::PostOpList{}; +} + +experimental::PostOpList post_ops_1() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); + post_ops.push_back_op>( + std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + 0, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_2() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_3() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + post_ops.push_back_op>( + std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + return post_ops; +} +experimental::PostOpList invalid_post_ops_1() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>( + std::make_tuple(false, true, false), // If broadcast in dims 0, 1 and 2 + 0, + ConvertPolicy::SATURATE); + return post_ops; +} + +/** Different 
Post Op Lists */ +const auto post_op_lists = framework::dataset::make("post_op_lists", { + post_ops_1(), + post_ops_2(), + post_ops_3(), + } ); + } // namespace TEST_SUITE(CL) @@ -328,7 +398,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi framework::dataset::make("Expected", { true, true, false, false, false, true, true,true})), input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected) { - ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), + ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), &input1_info.clone()->set_is_resizable(true), &input2_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true),1.f,1.f, @@ -336,6 +406,116 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi rhs_info, gemm_info)) == expected, framework::LogLevel::ERRORS); } +DATA_TEST_CASE(ValidateFusedPosOps, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("Input0Info", { TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32), // OK. Empty post ops + TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32), // Invalid post op sequences + TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32), // OK. 
Supported post ops + + }), + framework::dataset::make("Input1Info",{ TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32), + + })), + framework::dataset::make("Input2Info", { TensorInfo(TensorShape(21U), 1, DataType::F32), + TensorInfo(TensorShape(21U), 1, DataType::F32), + TensorInfo(TensorShape(21U), 1, DataType::F32), + + })), + framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32), + TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32), + TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32), + + })), + framework::dataset::make("LHSMInfo",{ + GEMMLHSMatrixInfo(4,4,1,false,true), + GEMMLHSMatrixInfo(4,4,1,false,true), + GEMMLHSMatrixInfo(4,4,1,false,true), + + })), + framework::dataset::make("RHSMInfo",{ + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + + + })), + + + framework::dataset::make("GEMMInfo",{ + GEMMKernelInfo( 17 /** post_op_tensor_infos; + auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, + [&output_info, &post_op_tensor_infos](auto broadcast){ + post_op_tensor_infos.emplace_back(TensorShape{ + std::get<0>(broadcast) ? 1 : output_info.dimension(0), + std::get<1>(broadcast) ? 1 : output_info.dimension(1), + std::get<2>(broadcast) ? 
1 : output_info.dimension(2) + }, 1, output_info.data_type()); + return &post_op_tensor_infos.back(); + }); + GEMMKernelInfo gemm_info_with_post_ops(std::move(gemm_info)); + gemm_info_with_post_ops.post_ops = populated_post_ops; + ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), + &input1_info.clone()->set_is_resizable(true), + &input2_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true),1.f,1.f, + lhs_info, + rhs_info, + gemm_info_with_post_ops)) == expected, framework::LogLevel::ERRORS); +} TEST_SUITE(Float) TEST_SUITE(FP32) @@ -438,6 +618,37 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("DataType", DataType::F32)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE(ExportToCLImage) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( framework::dataset::make("Input0Info", { 
TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // OK or incorrect if cl_khr_image2d_from_buffer not supported @@ -704,6 +915,45 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F32)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP32 @@ -809,6 +1059,37 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE(ExportToCLImage) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported @@ -1075,6 +1356,45 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + 
combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP16 @@ -1179,6 +1499,38 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision); } + +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + 
framework::dataset::make("export_to_cl_image_rhs", { true, false })), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // MixedPrecision TEST_SUITE_END() // Float TEST_SUITE_END() // GEMMMatrixMultiplyReshaped diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index 5f5fa3b653..e1191587d5 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -27,6 +27,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/IPostOp.h" +#include "src/core/experimental/PostOp.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" #include "tests/IAccessor.h" @@ -34,7 +36,9 @@ #include "tests/framework/Fixture.h" #include "tests/validation/Helpers.h" #include "tests/validation/reference/ActivationLayer.h" +#include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/GEMM.h" +#include "tests/validation/reference/PostOps.h" #include @@ -915,6 +919,263 @@ protected: SimpleTensor _reference{}; }; +/** (EXPERIMENTAL_POST_OPS)*/ +template +class GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture : public framework::Fixture +{ +public: + using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument +public: + template + void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool 
interleave_lhs, + bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info, + const experimental::PostOpList &post_ops) + { + GEMMLHSMatrixInfo lhs_info; + lhs_info.m0 = m0; + lhs_info.k0 = k0; + lhs_info.v0 = v0; + lhs_info.interleave = interleave_lhs; + lhs_info.transpose = lhs_transpose; + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + rhs_info.h0 = h0; + rhs_info.interleave = interleave_rhs; + rhs_info.transpose = !lhs_transpose; + rhs_info.export_to_cl_image = export_to_cl_image; + + // Set the tensor shapes for LHS and RHS matrices + const TensorShape lhs_shape(k, m, batch_size); + const TensorShape rhs_shape(n, k, batch_size); + const TensorShape bias_shape(n, + broadcast_bias ? 1 : m, + broadcast_bias ? 1 : batch_size); + auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, + [ = ](auto broadcast) + { + return TensorShape + { + std::get<0>(broadcast) ? 1 : n, + std::get<1>(broadcast) ? 1 : m, + std::get<2>(broadcast) ? 1 : batch_size, + }; + }); + + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + } + } + +protected: + template + void fill(U &&tensor, int i) + { + static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); + using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; + + DistributionType distribution{ T(-1.0f), T(1.0f) }; + library->fill(tensor, distribution, i); + + // Fill border with infinity in order to check the presence of NaN values (i.e. 
inf * 0) + DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; + library->fill_borders_with_garbage(tensor, distribution_inf, i); + } + + TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + // Create tensors + TensorType lhs = create_tensor(lhs_shape, data_type, 1); + TensorType rhs = create_tensor(rhs_shape, data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + + // Create post op tensors and populate post op with them + std::vector post_op_tensors_holder{}; + auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, + [&post_op_tensors_holder, &data_type](auto shape) + { + auto t = create_tensor(shape, data_type, 1); + post_op_tensors_holder.push_back(std::move(t)); + return post_op_tensors_holder.back().info(); + }); + TensorType lhs_reshaped; + TensorType rhs_reshaped; + TensorType dst; + + const unsigned int M = lhs_shape[1]; + const unsigned int N = rhs_shape[0]; + const unsigned int K = lhs_shape[0]; + GEMMKernelInfo kernel_info; + kernel_info.m = M; + kernel_info.n = N; + kernel_info.k = K; + kernel_info.depth_output_gemm3d = 0; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = act_info; + kernel_info.fp_mixed_precision = fp_mixed_precision; + kernel_info.post_ops = populated_post_ops; + + // The output tensor will be auto-initialized within the function + + // Create and configure function + ReshapeLHSOperatorType reshape_lhs; + ReshapeRHSOperatorType reshape_rhs; + GEMMOperatorType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + 
validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + + reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); + reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + + ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); + } + + // We do not pad when using image as it needs to comply to strict pitch alignment restrictions + if(!rhs_info.export_to_cl_image) + { + add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst }); + for(auto &tensor : post_op_tensors_holder) + { + add_padding_x({ &tensor }); + } + } + + // Allocate tensors + lhs.allocator()->allocate(); + rhs.allocator()->allocate(); + lhs_reshaped.allocator()->allocate(); + rhs_reshaped.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + for(auto &tensor : post_op_tensors_holder) + { + tensor.allocator()->allocate(); + } + + ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); + } + + // Fill tensors + fill(AccessorType(lhs), 0); + fill(AccessorType(rhs), 1); + fill(AccessorType(bias), 2); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); + } + + // Compute GEMM + ITensorPack reshape_lhs_pack = { 
{ ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }; + reshape_lhs.run(reshape_lhs_pack); + ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; + reshape_rhs.run(reshape_rhs_pack); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, + { ACL_SRC_1, &rhs_reshaped }, + { ACL_SRC_2, &bias }, + { ACL_DST, &dst } + }); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); + } + gemm.run(gemm_pack); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, + const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + TensorShape dst_shape = lhs_shape; + dst_shape[0] = rhs_shape[0]; + dst_shape[1] = lhs_shape[1]; + + // Create reference + SimpleTensor lhs{ lhs_shape, data_type, 1 }; + SimpleTensor rhs{ rhs_shape, data_type, 1 }; + SimpleTensor bias{ dst_shape, data_type, 1 }; + // Create post op tensors and populate post op with them + auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) + { + return SimpleTensor { shape, data_type, 1 }; + }); + + const int n = rhs_shape[0]; + const int m = lhs_shape[1]; + const int batch_size = lhs_shape[2]; + + // Fill reference + int tensor_idx = 0; + fill(lhs, tensor_idx++); + fill(rhs, tensor_idx++); + fill(bias, tensor_idx++); + for(auto &op : populated_post_ops.get_list()) + { + for(auto tensor : op->arguments()) + { + fill(*tensor, tensor_idx++); + } + } + + if(broadcast_bias) + { + // In case of broadcast, we need simply copy the first into the following "M" ones + for(int i = 1; i < m * batch_size; i++) + { + memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); + } + } + + SimpleTensor out; + if(fp_mixed_precision) + { + out = reference::gemm_mixed_precision(lhs, rhs, bias, alpha, beta); + } + 
else + { + out = reference::gemm(lhs, rhs, bias, alpha, beta); + } + // Ignore activation info if post ops are used instead + if(populated_post_ops.size() > 0) + { + out = reference::post_ops(out, populated_post_ops); + } + else + { + out = reference::activation_layer(out, act_info); + } + return out; + } + + bool validate_result = true; + TensorType _target{}; + SimpleTensor _reference{}; +}; + template class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture { diff --git a/tests/validation/reference/PostOps.cpp b/tests/validation/reference/PostOps.cpp new file mode 100644 index 0000000000..1a8fb990c8 --- /dev/null +++ b/tests/validation/reference/PostOps.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "PostOps.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "src/core/experimental/PostOp.h" +#include "tests/validation/reference/ActivationLayer.h" +#include "tests/validation/reference/ElementwiseOperations.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ +template ::value, int>::type> +SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops) +{ + // Create reference + SimpleTensor dst{ a }; + + for(auto &post_op : post_ops.get_list()) + { + switch(post_op->type()) + { + case experimental::PostOpType::Activation: + { + const auto _post_op = utils::cast::polymorphic_downcast> *>(post_op.get()); + dst = reference::activation_layer(dst, _post_op->_act_info); + break; + } + case experimental::PostOpType::Eltwise_Add: + { + const auto _post_op = utils::cast::polymorphic_downcast> *>(post_op.get()); + dst = reference::arithmetic_operation(ArithmeticOperation::ADD, dst, _post_op->_addend, dst, _post_op->_policy); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported PostOpType"); + } + } + } + return dst; +} + +template SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops); +template SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops); +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/reference/PostOps.h b/tests/validation/reference/PostOps.h new file mode 100644 index 0000000000..5fe0fe71f5 --- /dev/null +++ b/tests/validation/reference/PostOps.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_TEST_POSTOPS_H +#define ARM_COMPUTE_TEST_POSTOPS_H + +#include "arm_compute/core/experimental/IPostOp.h" +#include "tests/SimpleTensor.h" +#include "tests/validation/Helpers.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ +/** (EXPERIMENTAL_POST_OPS) */ +template ::value, int>::type = 0> +SimpleTensor post_ops(const SimpleTensor &a, experimental::PostOpList> post_ops); + +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_POSTOPS_H */ diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index 220c3ac189..64694f0e7c 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -36,12 +36,13 @@ #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" #include "arm_compute/runtime/CL/CLTypes.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/common/LSTMParams.h" +#include "src/core/experimental/PostOp.h" #include "support/StringSupport.h" - #include #include #include @@ -135,6 +136,133 @@ std::string to_string(const std::vector &args) return str.str(); } +/** @name (EXPERIMENTAL_POST_OPS) + * @{ + */ +/** Formatted output of the @ref experimental::PostOpType type + * + * @param[out] os Output stream. + * @param[in] post_op_type Type to output. + * + * @return Modified output stream. 
+ */ +inline ::std::ostream &operator<<(::std::ostream &os, experimental::PostOpType post_op_type) +{ + os << "type="; + switch(post_op_type) + { + case experimental::PostOpType::Activation: + { + os << "Activation"; + break; + } + case experimental::PostOpType::Eltwise_Add: + { + os << "Eltwise_Add"; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported PostOpType"); + break; + } + } + return os; +} +/** Converts a @ref experimental::PostOpType to string + * + * @param[in] post_op_type PostOpType value to be converted + * + * @return String representing the corresponding PostOpType + */ +inline std::string to_string(experimental::PostOpType post_op_type) +{ + std::stringstream str; + str << post_op_type; + return str.str(); +} +/** Formatted output of the @ref experimental::IPostOp type. + * + * @param[out] os Output stream. + * @param[in] post_op Type to output. + * + * @return Modified output stream. + */ +template +inline ::std::ostream &operator<<(::std::ostream &os, const experimental::IPostOp &post_op) +{ + os << "<"; + os << post_op.type() << ","; + os << "prev_op_arg_pos=" << post_op.prev_dst_pos() << ","; + switch(post_op.type()) + { + case experimental::PostOpType::Activation: + { + const auto _post_op = utils::cast::polymorphic_downcast *>(&post_op); + os << "act_info=" << &(_post_op->_act_info); + break; + } + case experimental::PostOpType::Eltwise_Add: + { + const auto _post_op = utils::cast::polymorphic_downcast *>(&post_op); + os << "convert_policy=" << _post_op->_policy; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported PostOpType"); + break; + } + } + os << ">"; + return os; +} +/** Converts an @ref experimental::IPostOp to string + * + * @param[in] post_op IPostOp value to be converted + * + * @return String representing the corresponding IPostOp + */ +template +inline std::string to_string(const experimental::IPostOp &post_op) +{ + std::stringstream str; + str << post_op; + return str.str(); +} +/** Formatted output of the @ref 
experimental::PostOpList type. + * + * @param[out] os Output stream. + * @param[in] post_ops Type to output. + * + * @return Modified output stream. + */ +template +inline ::std::ostream &operator<<(::std::ostream &os, const experimental::PostOpList &post_ops) +{ + os << "["; + for(const auto &post_op : post_ops.get_list()) + { + os << *post_op << ","; + } + os << "]"; + return os; +} +/** Converts a @ref experimental::PostOpList to string + * + * @param[in] post_ops PostOpList value to be converted + * + * @return String representing the corresponding PostOpList + */ +template +inline std::string to_string(const experimental::PostOpList &post_ops) +{ + std::stringstream str; + str << post_ops; + return str.str(); +} +/** @} */ // end of group (EXPERIMENTAL_POST_OPS) + /** Formatted output of the Dimensions type. * * @param[out] os Output stream. @@ -244,8 +372,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMKernelInfo &gemm os << " fp_mixed_precision= " << gemm_info.fp_mixed_precision; os << " mult_transpose1xW_width= " << gemm_info.mult_transpose1xW_width; os << " mult_interleave4x4_height= " << gemm_info.mult_interleave4x4_height; - os << " a_offset = " << gemm_info.a_offset; - os << " b_offset = " << gemm_info.b_offset; + os << " a_offset= " << gemm_info.a_offset; + os << " b_offset= " << gemm_info.b_offset; + os << " post_ops= " << gemm_info.post_ops; os << ")"; return os; } @@ -487,7 +616,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo: /** Formatted output of the activation function info type. * - * @param[in] info Type to output. + * @param[in] info ActivationLayerInfo to output. * * @return Formatted string. */ @@ -501,25 +630,35 @@ inline std::string to_string(const arm_compute::ActivationLayerInfo &info) return str.str(); } -/** Formatted output of the activation function info type. +/** Formatted output of the activation function info. * - * @param[in] info Type to output. 
+ * @param[out] os Output stream. + * @param[in] info ActivationLayerInfo to output. * * @return Formatted string. */ -inline std::string to_string(const arm_compute::ActivationLayerInfo *info) +inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo *info) { - std::string ret_str = "nullptr"; if(info != nullptr) { - std::stringstream str; if(info->enabled()) { - str << info->activation(); + os << info->activation(); + os << "("; + os << "VAL_A=" << info->a() << ","; + os << "VAL_B=" << info->b(); + os << ")"; + } + else + { + os << "disabled"; } - ret_str = str.str(); } - return ret_str; + else + { + os << "nullptr"; + } + return os; } /** Formatted output of the activation function type. -- cgit v1.2.1