author     SiCongLi <sicong.li@arm.com>   2021-10-06 15:25:57 +0100
committer  SiCong Li <sicong.li@arm.com>  2021-10-28 11:00:52 +0000
commit     1af5416917268692fcd4b34b1d7ffebd3a2aea8a (patch)
tree       81833ecad401eeb0101fb0d464728df8b699caf8 /arm_compute
parent     49956ccf029ff4c1873e3a6702b5bede95d81f7a (diff)
download   ComputeLibrary-1af5416917268692fcd4b34b1d7ffebd3a2aea8a.tar.gz
Add experimental PostOp interface to ClGemmMatrixMultiplyReshapedKernel Part 1
This interface supports the fusion of multiple elementwise operations.

Partially resolves: COMPMID-4435

Change-Id: If68dd7dd98dcf239fde7cb1f0a4a6d4d1e899a6f
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6483
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/core/KernelDescriptors.h     |  69
-rw-r--r--  arm_compute/core/Types.h                 |   6
-rw-r--r--  arm_compute/core/experimental/IPostOp.h  | 162
-rw-r--r--  arm_compute/core/experimental/Types.h    |   5
4 files changed, 209 insertions, 33 deletions
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index a6e5c3372e..bc6ec1f6c5 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/IPostOp.h"
namespace arm_compute
{
@@ -52,48 +53,52 @@ struct FFTRadixStageKernelInfo
bool is_first_stage{ false }; /**< Flags if the FFT kernel is the first stage of a decomposed FFT. */
};
+class ITensorInfo;
/** Descriptor used by the GEMM kernels */
struct GEMMKernelInfo
{
GEMMKernelInfo() = default;
GEMMKernelInfo(
- unsigned int im,
- unsigned int in,
- unsigned int ik,
- unsigned int idepth_output_gemm3d,
- bool ireinterpret_input_as_3d,
- bool ibroadcast_bias,
- bool ifp_mixed_precision,
- bool ihas_pad_y,
- ActivationLayerInfo iactivation_info,
- int inmult_transpose1xW_width,
- int imult_interleave4x4_height,
- GEMMLHSMatrixInfo ilhs_info,
- GEMMRHSMatrixInfo irhs_info,
- int32_t ina_offset,
- int32_t inb_offset)
+ unsigned int im,
+ unsigned int in,
+ unsigned int ik,
+ unsigned int idepth_output_gemm3d,
+ bool ireinterpret_input_as_3d,
+ bool ibroadcast_bias,
+ bool ifp_mixed_precision,
+ bool ihas_pad_y,
+ ActivationLayerInfo iactivation_info,
+ int inmult_transpose1xW_width,
+ int imult_interleave4x4_height,
+ GEMMLHSMatrixInfo ilhs_info,
+ GEMMRHSMatrixInfo irhs_info,
+ int32_t ina_offset,
+ int32_t inb_offset,
+ const experimental::PostOpList<ITensorInfo *> &ipost_ops = experimental::PostOpList<ITensorInfo *> {})
: m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info),
- rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset)
+ rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops)
{
}
- unsigned int m{ 0 }; /**< Number of LHS rows*/
- unsigned int n{ 0 }; /**< Number of RHS columns*/
- unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */
- unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */
- bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
- bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */
- bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
- bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
- ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
- int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */
- int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */
- GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
- GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
- int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */
- int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */
- GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
+ unsigned int m{ 0 }; /**< Number of LHS rows*/
+ unsigned int n{ 0 }; /**< Number of RHS columns*/
+ unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */
+ unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case it is reinterpreted as 3D */
+ bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
+ bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */
+ bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+ bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+ ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
+ int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */
+ int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */
+ GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+ GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
+ int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */
+ int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */
+ GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
+ experimental::PostOpList<ITensorInfo *> post_ops{}; /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note that unsupported post ops will not be executed.
+ * If specified, @ref activation_info is automatically disabled. */
};
/** Compute descriptor used by the depthwise convolution native kernel */
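As a rough usage sketch (not part of this patch), the new post_ops field could be populated at configure time as follows. ExamplePostOpEltwiseAdd stands in for a concrete IPostOp<ITensorInfo *> implementation; a self-contained sketch of such a type is given after the new IPostOp.h file further down.

// Sketch only: ExamplePostOpEltwiseAdd is the illustrative post op defined in
// the sketch that follows the IPostOp.h diff below; it is not library code.
#include "arm_compute/core/KernelDescriptors.h"

using namespace arm_compute;

GEMMKernelInfo make_gemm_info(ITensorInfo *addend_info)
{
    experimental::PostOpList<ITensorInfo *> post_ops{};
    // Fuse "dst = add(dst, addend)" after the main GEMM; the previous op's dst
    // is passed implicitly at position 0, so only the extra addend is listed.
    post_ops.push_back_op<ExamplePostOpEltwiseAdd>(addend_info, /* prev_dst_pos */ 0);

    GEMMKernelInfo info{};
    info.m = 64;
    info.n = 64;
    info.k = 128;
    // Per the descriptor's documentation, specifying post_ops disables activation_info.
    info.post_ops = post_ops; // deep-copied: each post op is cloned
    return info;
}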
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index b2b09825c1..bfe85ea937 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -382,7 +382,11 @@ struct BorderSize
/** Container for 2D padding size */
using PaddingSize = BorderSize;
-/** Policy to handle overflow */
+/** Policy to handle integer overflow
+ * @note This is ignored by floating-point operations, where overflow behaviour follows the IEEE-754 standard:
+ * by default ±infinity is returned for the round-to-nearest modes (and the rounding rules of the directed
+ * rounding modes are followed).
+ */
enum class ConvertPolicy
{
WRAP, /**< Wrap around */
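As a conceptual illustration of the distinction the note above draws (plain C++, not library code): the policy only matters for integer data, where wrap-around folds values modulo the destination range while a saturating policy clamps them to it; floating-point results instead overflow to ±infinity per IEEE-754.

// Conceptual sketch only, not library code: wrap-around versus saturating
// narrowing of an int32_t value to uint8_t.
#include <algorithm>
#include <cstdint>

uint8_t narrow_wrap(int32_t v)
{
    return static_cast<uint8_t>(v); // modular wrap-around: 300 -> 44
}

uint8_t narrow_saturate(int32_t v)
{
    return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(v, 0), 255)); // clamp: 300 -> 255
}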
diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h
new file mode 100644
index 0000000000..cd6b8fc4cc
--- /dev/null
+++ b/arm_compute/core/experimental/IPostOp.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
+#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Type of Post Op */
+enum class PostOpType
+{
+ Activation,
+ Eltwise_Add,
+};
+/** An ordered sequence of type of Post Ops */
+using PostOpTypeSequence = std::vector<PostOpType>;
+/** An elementwise n-ary operation that can be appended to and fused with (at kernel level) other operators.
+ * It contains:
+ * 1. The attributes of the original operator.
+ * 2. Any additional tensor arguments.
+ * 3. The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos )
+ *
+ * For example, a series of chained ops:
+ *
+ * div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info)
+ *
+ * translates to
+ *
+ * dst = conv(src0, weights, bias, conv_info) // main op
+ * dst = relu(dst, act_info) // previous dst is placed in the first (and only) argument
+ * dst = div(src1, dst, div_info) // previous dst is placed in the second argument
+ *
+ * which in turn translates to:
+ *
+ * main op: conv(src0, weights, bias, conv_info)
+ * post op1: relu(act_info, prev_dst_pos = 0)
+ * post op2: div(div_info, src1, prev_dst_pos = 1)
+ *
+ * NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type
+ * NOTE: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime
+ * and the lifetime of its copies. This is almost guaranteed as IPostOp is only meant to be used at configure time
+ * after the ITensor or ITensorInfo objects are already constructed
+ */
+template <typename TensorRelatedT>
+struct IPostOp
+{
+ /** Get the arity of the post op
+ * NOTE: this is one fewer than the arity of the original op, because we implicitly pass the previous op's dst
+ * tensor as one of the arguments
+ */
+ size_t arity() const
+ {
+ return arguments().size();
+ }
+ /** The position of the previous op's dst in the current op's argument list */
+ virtual int prev_dst_pos() const = 0;
+ /** The IPostOp type */
+ virtual PostOpType type() const = 0;
+ /** The argument tensors
+ * The order of the argument tensors is strictly preserved
+ */
+ virtual std::vector<TensorRelatedT *> arguments() = 0;
+ virtual std::vector<const TensorRelatedT *> arguments() const = 0;
+ /** Clone method used in cases where PostOps are owned by unique_ptr
+ * NOTE: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource
+ */
+ virtual std::unique_ptr<IPostOp<TensorRelatedT>> clone() const = 0;
+ virtual ~IPostOp()
+ {
+ }
+};
+
+/** A sequence of PostOps that can be appended to the end of other operators */
+template <typename TensorRelatedT>
+class PostOpList
+{
+public:
+ /** Constructor */
+ PostOpList() = default;
+ /** Destructor */
+ ~PostOpList() = default;
+ PostOpList(const PostOpList &other)
+ {
+ for(const auto &op : other._post_ops)
+ {
+ this->_post_ops.push_back(op->clone());
+ }
+ }
+ PostOpList &operator=(const PostOpList &other)
+ {
+ PostOpList tmp{ other };
+ std::swap(tmp, *this);
+ return *this;
+ }
+ PostOpList(PostOpList &&other) = default;
+ PostOpList &operator=(PostOpList &&other) = default;
+
+ /** Add a new post op at the end of the list */
+ template <typename OpT, typename... Args>
+ void push_back_op(Args &&... args)
+ {
+ _post_ops.push_back(std::make_unique<OpT>(std::forward<Args>(args)...));
+ }
+
+ /** Number of post ops */
+ size_t size() const
+ {
+ return _post_ops.size();
+ }
+
+ /** Total number of arguments across all post ops */
+ size_t total_num_arguments() const
+ {
+ return std::accumulate(_post_ops.begin(), _post_ops.end(), 0, [](size_t op1_arity, const auto & op2)
+ {
+ return op1_arity + op2->arity();
+ });
+ }
+
+ /** Get the underlying post op list */
+ std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list()
+ {
+ return _post_ops;
+ }
+ const std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list() const
+ {
+ return _post_ops;
+ }
+
+private:
+ std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> _post_ops{};
+};
+
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
\ No newline at end of file
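To make the interface above more concrete, here is a minimal sketch of one possible post op implementation and its use with PostOpList. ExamplePostOpEltwiseAdd and make_post_ops are illustrative assumptions, not part of this patch; the sketch models a binary elementwise op in which the previous op's dst occupies one of the two argument positions, as in the chained-ops example in the header comment.

// Sketch only: an assumed, illustrative IPostOp<ITensorInfo *> implementation.
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/experimental/IPostOp.h"

#include <memory>
#include <vector>

using namespace arm_compute;
using namespace arm_compute::experimental;

struct ExamplePostOpEltwiseAdd : public IPostOp<ITensorInfo *>
{
    ExamplePostOpEltwiseAdd(ITensorInfo *addend, int prev_dst_pos)
        : _addend{ addend }, _prev_dst_pos{ prev_dst_pos }
    {
    }
    int prev_dst_pos() const override
    {
        return _prev_dst_pos; // where the previous op's dst sits in this op's argument list
    }
    PostOpType type() const override
    {
        return PostOpType::Eltwise_Add;
    }
    std::vector<ITensorInfo **> arguments() override
    {
        return { &_addend }; // only the extra tensor; the previous dst is implicit
    }
    std::vector<ITensorInfo *const *> arguments() const override
    {
        return { &_addend };
    }
    std::unique_ptr<IPostOp<ITensorInfo *>> clone() const override
    {
        return std::make_unique<ExamplePostOpEltwiseAdd>(*this);
    }
    ITensorInfo *_addend{ nullptr };
    int          _prev_dst_pos{ 0 };
};

// Usage sketch: append one fused elementwise addition with the previous op's
// dst at position 1 of the addition's argument list.
PostOpList<ITensorInfo *> make_post_ops(ITensorInfo *addend_info)
{
    PostOpList<ITensorInfo *> post_ops{};
    post_ops.push_back_op<ExamplePostOpEltwiseAdd>(addend_info, 1);
    // post_ops.size() == 1; post_ops.total_num_arguments() == 1, since the
    // implicit previous-dst argument is not counted by arity().
    return post_ops;
}

Note that copying a PostOpList (as GEMMKernelInfo does when its post_ops member is assigned) clones each contained op via IPostOp::clone(), so the list owns its op objects while the pointed-to ITensorInfo resources remain externally owned.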
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index a478513b1b..c8755dc26c 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -76,6 +76,11 @@ enum TensorType : int32_t
ACL_VEC_COL_SUM = ACL_SRC_4,
ACL_SHIFTS = ACL_SRC_5,
ACL_MULTIPLIERS = ACL_SRC_6,
+
+ // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else
+ EXPERIMENTAL_ACL_POST_OP_ARG = 2048,
+ EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG,
+ EXPERIMENTAL_ACL_POST_OP_ARG_LAST = EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments
};
namespace experimental
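The reserved range above leaves room for up to 1024 fused-argument tensors. A minimal sketch of how a kernel might map the i-th flattened post-op argument onto this slot range is shown below; the helper name is illustrative, not part of the library API.

// Sketch only: illustrative helper, not library code.
#include "arm_compute/core/experimental/Types.h"

#include <cassert>
#include <cstddef>

inline int post_op_arg_slot(size_t flat_arg_index)
{
    // e.g. post_op_arg_slot(0) == 2048, the first reserved post-op slot.
    const int slot = static_cast<int>(arm_compute::EXPERIMENTAL_ACL_POST_OP_ARG_FIRST) + static_cast<int>(flat_arg_index);
    // Stay inside the slots reserved for post-op arguments.
    assert(slot <= static_cast<int>(arm_compute::EXPERIMENTAL_ACL_POST_OP_ARG_LAST));
    return slot;
}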