aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2018-01-12 10:21:40 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:44:21 +0000
commit36a0a4608bf413fc1fd65eb335bfb736ef602149 (patch)
tree2ff0e35dc9e16fedd601b1f24bdc13d25d075b90 /arm_compute/core
parent46edf63bd630f5e3f3eb31b7d4602caa317da075 (diff)
downloadComputeLibrary-36a0a4608bf413fc1fd65eb335bfb736ef602149.tar.gz
COMPMID-748 - Integrating optimized SGEMM for bifrost
This patch introduces a new GEMM capable of improving MAC utilisation by 10% compared to the GEMM without reshape. However, this implementation is not faster in all cases, as we need to take into account the time spent reshaping the matrices. For this reason, a heuristic to select the optimal GEMM to use has been added to the function. More information about the heuristic implementation can be found at COMPMID-852. With this new patch, GoogleNet, MobileNet, VGG16 and SqueezeNet improve performance by up to 1.5x. More information about the performance uplift can be found here: https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.02 Change-Id: I024563c06b9aed02a211a974e452bae5c233b04c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117140 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/core')
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h18
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h18
-rw-r--r--arm_compute/core/Types.h108
-rw-r--r--arm_compute/core/utils/misc/ShapeCalculator.h22
5 files changed, 140 insertions, 36 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index 2520eff5de..c0fef45afe 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ * After this operation, the output matrix will have the following shape: [ height * W, ceil(width / W) ] where W = 4 * mult_interleave4x4_height
*/
class CLGEMMInterleave4x4Kernel : public ICLKernel
{
@@ -64,18 +64,20 @@ public:
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 4e73d7eb13..7260c4a4f6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,8 +58,10 @@ public:
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+ *
*/
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
*
* @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
@@ -67,11 +69,13 @@ public:
* @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
* @param[in] gpu_target GPU Target
*
* @return a status
*/
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target);
+ static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+ GPUTarget gpu_target);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 8721643c1e..9a3069eab6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,7 +62,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
*
*/
class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
@@ -70,18 +70,20 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[in] output Output tensor. Data type supported: same as @p input.
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor. Data type supported: same as @p input.
+ * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 5402e358b5..5197000bf9 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -824,13 +824,95 @@ private:
const unsigned int _num_kernels;
};
-/** GEMM Information class. This class stores the necessary information to compute GEMM functions */
+/** GEMM reshape information class. This class stores the necessary information about how matrix A and matrix B have been reshaped.
+ *
+ * The matrix A can only be reshaped through @ref CLGEMMInterleave4x4Kernel or @ref NEGEMMInterleave4x4Kernel or @ref GCGEMMInterleave4x4Kernel
+ * Note: Optionally, just for @ref CLGEMMInterleave4x4Kernel, it is possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ *
+ * The matrix B can only be reshaped through @ref CLGEMMTranspose1xWKernel or @ref NEGEMMTranspose1xWKernel or @ref GCGEMMTranspose1xWKernel
+ * Note: Optionally, just for @ref CLGEMMTranspose1xWKernel, it is possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
+ *
+ */
+class GEMMReshapeInfo final
+{
+public:
+ /** Default constructor */
+ GEMMReshapeInfo()
+ : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] m Number of matrix A rows
+ * @param[in] n Number of matrix B columns
+ * @param[in] k Number of matrix A columns or matrix B rows
+ * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
+ * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block
+ */
+ GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1)
+ : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height)
+ {
+ }
+ /** Number of matrix A rows
+ *
+ * @return the number of matrix A rows
+ */
+ int m() const
+ {
+ return _m;
+ }
+ /** Number of matrix B columns
+ *
+ * @return the number of matrix B columns
+ */
+ int n() const
+ {
+ return _n;
+ }
+ /** Number of matrix A columns or matrix B rows
+ *
+ * @return the number of matrix A columns or matrix B rows
+ */
+ int k() const
+ {
+ return _k;
+ }
+ /** Multiplication factor for the width of the 1xW transposed block
+ *
+ * @return the multiplication factor for the width of the 1xW transposed block
+ */
+ int mult_transpose1xW_width() const
+ {
+ return _mult_transpose1xW_width;
+ }
+ /** Multiplication factor for the height of the 4x4 interleaved block
+ *
+ * @return the multiplication factor for the height of the 4x4 interleaved block
+ */
+ int mult_interleave4x4_height() const
+ {
+ return _mult_interleave4x4_height;
+ }
+
+private:
+ const int _m;
+ const int _n;
+ const int _k;
+ const int _mult_transpose1xW_width;
+ const int _mult_interleave4x4_height;
+};
+
+/** GEMM information class. This class stores the necessary information to compute GEMM functions
+ *
+ * This object also contains the information about how matrix A and matrix B have been reshaped
+ *
+ */
class GEMMInfo
{
public:
/** Default constructor */
GEMMInfo()
- : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false)
+ : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _reshape_info()
{
}
/** Constructor
@@ -838,9 +920,10 @@ public:
* @param[in] is_a_reshaped True if the matrix A has been reshaped
* @param[in] is_b_reshaped True if the matrix B has been reshaped
* @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
+ * @param[in] reshape_info (Optional) GEMM reshape information object
*/
- GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run)
- : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run)
+ GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo())
+ : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _reshape_info(reshape_info)
{
}
/** Flag which specifies if the matrix A has been reshaped
@@ -869,11 +952,20 @@ public:
{
return _reshape_b_only_on_first_run;
};
+ /** GEMMReshapeInfo object which stores the necessary information to understand how the matrix A and matrix B have been reshaped
+ *
+ * @return the GEMMReshapeInfo object
+ */
+ const GEMMReshapeInfo &reshape_info() const
+ {
+ return _reshape_info;
+ }
private:
- const bool _is_a_reshaped;
- const bool _is_b_reshaped;
- const bool _reshape_b_only_on_first_run;
+ const bool _is_a_reshaped;
+ const bool _is_b_reshaped;
+ const bool _reshape_b_only_on_first_run;
+ GEMMReshapeInfo _reshape_info;
};
/** IO formatting information class*/
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 61834b88a9..6ecfdf0323 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,12 +39,14 @@ inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, co
permute(output_shape, perm);
return output_shape;
}
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a)
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1)
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
+ ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
+ const int interleave_width = 4 * mult_interleave4x4_height;
TensorShape shape_interleaved_a{ a.tensor_shape() };
- shape_interleaved_a.set(0, a.dimension(0) * 4);
- shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f));
+ shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
+ shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width)));
return shape_interleaved_a;
}
@@ -57,12 +59,14 @@ inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
return shape_transposed1xW_b;
}
-inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b)
+inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width = 1)
{
- // The transpose1xW output matrix will have the following shape:
- // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size) ]
+ // Note: mult_transpose1xW_width expresses the number of chunks with size 1x(W) we want to store on the same row
+ // The transpose1xW output matrix will have the following shape:
+ // [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
+ ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
TensorShape shape_transposed1xW_b{ b.tensor_shape() };
- const size_t transpose_width = 16 / b.element_size();
+ const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));