diff options
author | SiCongLi <sicong.li@arm.com> | 2021-10-06 15:25:57 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2021-10-28 11:00:52 +0000 |
commit | 1af5416917268692fcd4b34b1d7ffebd3a2aea8a (patch) | |
tree | 81833ecad401eeb0101fb0d464728df8b699caf8 /arm_compute | |
parent | 49956ccf029ff4c1873e3a6702b5bede95d81f7a (diff) | |
download | ComputeLibrary-1af5416917268692fcd4b34b1d7ffebd3a2aea8a.tar.gz |
Add experimental PostOp interface to ClGemmMatrixMultiplyReshapedKernel Part 1
This interface supports the fusion of multiple elementwise operations
Partially resolves: COMPMID-4435
Change-Id: If68dd7dd98dcf239fde7cb1f0a4a6d4d1e899a6f
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6483
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r-- | arm_compute/core/KernelDescriptors.h | 69 | ||||
-rw-r--r-- | arm_compute/core/Types.h | 6 | ||||
-rw-r--r-- | arm_compute/core/experimental/IPostOp.h | 162 | ||||
-rw-r--r-- | arm_compute/core/experimental/Types.h | 5 |
4 files changed, 209 insertions, 33 deletions
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index a6e5c3372e..bc6ec1f6c5 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/IPostOp.h" namespace arm_compute { @@ -52,48 +53,52 @@ struct FFTRadixStageKernelInfo bool is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */ }; +class ITensorInfo; /** Descriptor used by the GEMM kernels */ struct GEMMKernelInfo { GEMMKernelInfo() = default; GEMMKernelInfo( - unsigned int im, - unsigned int in, - unsigned int ik, - unsigned int idepth_output_gemm3d, - bool ireinterpret_input_as_3d, - bool ibroadcast_bias, - bool ifp_mixed_precision, - bool ihas_pad_y, - ActivationLayerInfo iactivation_info, - int inmult_transpose1xW_width, - int imult_interleave4x4_height, - GEMMLHSMatrixInfo ilhs_info, - GEMMRHSMatrixInfo irhs_info, - int32_t ina_offset, - int32_t inb_offset) + unsigned int im, + unsigned int in, + unsigned int ik, + unsigned int idepth_output_gemm3d, + bool ireinterpret_input_as_3d, + bool ibroadcast_bias, + bool ifp_mixed_precision, + bool ihas_pad_y, + ActivationLayerInfo iactivation_info, + int inmult_transpose1xW_width, + int imult_interleave4x4_height, + GEMMLHSMatrixInfo ilhs_info, + GEMMRHSMatrixInfo irhs_info, + int32_t ina_offset, + int32_t inb_offset, + const experimental::PostOpList<ITensorInfo *> &ipost_ops = experimental::PostOpList<ITensorInfo *> {}) : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision), has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), - rhs_info(irhs_info), 
a_offset(ina_offset), b_offset(inb_offset) + rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops) { } - unsigned int m{ 0 }; /**< Number of LHS rows*/ - unsigned int n{ 0 }; /**< Number of RHS columns*/ - unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */ - unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */ - bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ - bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ - bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */ - bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ - ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ - int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ - int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ - GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */ - GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */ - int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */ - int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */ - GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */ + unsigned int m{ 0 }; /**< Number of LHS rows*/ + unsigned int n{ 0 }; /**< Number of RHS columns*/ + unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */ + unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */ + bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ + bool 
broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ + bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */ + bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ + ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ + int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ + int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ + GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */ + GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */ + int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */ + int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */ + GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */ + experimental::PostOpList<ITensorInfo *> post_ops{}; /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note unsupported post ops would not be executed. 
+ * If specified, automatically disable the @ref activation_info */ }; /** Compute descriptor used by the depthwise convolution native kernel */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index b2b09825c1..bfe85ea937 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -382,7 +382,11 @@ struct BorderSize /** Container for 2D padding size */ using PaddingSize = BorderSize; -/** Policy to handle overflow */ +/** Policy to handle integer overflow + * @note: This is ignored by floating point operations where the overflow behavior adheres to the IEEE-754 standard + * which states that in case of overflow ±infinity is returned for the round-to-nearest modes (and follows the + * rounding rules for the directed rounding modes) by default. + */ enum class ConvertPolicy { WRAP, /**< Wrap around */ diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h new file mode 100644 index 0000000000..cd6b8fc4cc --- /dev/null +++ b/arm_compute/core/experimental/IPostOp.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP +#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP + +#include <memory> +#include <numeric> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +/** Type of Post Op */ +enum class PostOpType +{ + Activation, + Eltwise_Add, +}; +/** An ordered sequence of type of Post Ops */ +using PostOpTypeSequence = std::vector<PostOpType>; +/** An elementwise n-ary operation that can be appended to and fused with (at kernel-level) other operators + * It contains: + * 1. The attributes of the original operator. + * 2. Any additional tensor argument. + * 3. The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos ) + * + * For example, a series of chained ops: + * + * div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info) + * + * translates to + * + * dst = conv(src0, weights, bias, conv_info) // main op + * dst = relu(dst, act_info) // previous dst is placed in the first (and only) argument + * dst = div(src1, dst, div_info) // previous dst is placed in the second argument + * + * which in turn translates to: + * + * main op: conv(src0, weights, bias, conv_info) + * post op1: relu(act_info, prev_dst_pos = 0) + * post op2: div(div_info, src1, prev_dst_pos = 1) + * + * NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type + * NOTE: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime + * and the lifetime of its copies. 
This is almost guaranteed as IPostOp is only meant to be used at configure time + * after the ITensor or ITensorInfo objects are already constructed + */ +template <typename TensorRelatedT> +struct IPostOp +{ + /** Get the arity of the post op + * NOTE: that this is one fewer than the arity of the original op, because we implicitly pass the previous op's dst + * tensor as one of the arguments + */ + size_t arity() const + { + return arguments().size(); + } + /** The position of previous op's dst in current op's argument list */ + virtual int prev_dst_pos() const = 0; + /** The IPostOp type */ + virtual PostOpType type() const = 0; + /** The argument tensors + * The order of the argument tensor is strictly preserved + */ + virtual std::vector<TensorRelatedT *> arguments() = 0; + virtual std::vector<const TensorRelatedT *> arguments() const = 0; + /** Clone method used in cases where PostOps are owned by unique_ptr + * NOTE: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource + */ + virtual std::unique_ptr<IPostOp<TensorRelatedT>> clone() const = 0; + virtual ~IPostOp() + { + } +}; + +/** A sequence of PostOps that can be appended to the end of other operators */ +template <typename TensorRelatedT> +class PostOpList +{ +public: + /** Constructor */ + PostOpList() = default; + /** Destructor */ + ~PostOpList() = default; + PostOpList(const PostOpList &other) + { + for(const auto &op : other._post_ops) + { + this->_post_ops.push_back(op->clone()); + } + } + PostOpList &operator=(const PostOpList &other) + { + PostOpList tmp{ other }; + std::swap(tmp, *this); + return *this; + } + PostOpList(PostOpList &&other) = default; + PostOpList &operator=(PostOpList &&other) = default; + + /** Add a new post op at the end of the list */ + template <typename OpT, typename... Args> + void push_back_op(Args &&... 
args) + { + _post_ops.push_back(std::make_unique<OpT>(std::forward<Args>(args)...)); + } + + /** Number of post ops */ + size_t size() const + { + return _post_ops.size(); + } + + /** Total number of arguments across all post ops */ + size_t total_num_arguments() const + { + return std::accumulate(_post_ops.begin(), _post_ops.end(), 0, [](size_t op1_arity, const auto & op2) + { + return op1_arity + op2->arity(); + }); + } + + /** Get the underlying post op list */ + std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list() + { + return _post_ops; + } + const std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list() const + { + return _post_ops; + } + +private: + std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> _post_ops{}; +}; + +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
\ No newline at end of file diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index a478513b1b..c8755dc26c 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -76,6 +76,11 @@ enum TensorType : int32_t ACL_VEC_COL_SUM = ACL_SRC_4, ACL_SHIFTS = ACL_SRC_5, ACL_MULTIPLIERS = ACL_SRC_6, + + // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else + EXPERIMENTAL_ACL_POST_OP_ARG = 2048, + EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG, + EXPERIMENTAL_ACL_POST_OP_ARG_LAST = EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments }; namespace experimental |