aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2018-01-12 10:21:40 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:44:21 +0000
commit36a0a4608bf413fc1fd65eb335bfb736ef602149 (patch)
tree2ff0e35dc9e16fedd601b1f24bdc13d25d075b90
parent46edf63bd630f5e3f3eb31b7d4602caa317da075 (diff)
downloadComputeLibrary-36a0a4608bf413fc1fd65eb335bfb736ef602149.tar.gz
COMPMID-748 - Integrating optimized SGEMM for bifrost
This patch introduces a new GEMM capable to improve the mac utilisation of 10% compared to the GEMM without reshape. However this implementation is not faster in all cases as we need to take into account the time for reshaping the matrices. For this reason an heuristic solution to select the optimal GEMM to use has been added to the function. More information about the heuristic implementation can be found at COMPMID-852. With this new patch, GoogleNet, MobileNet, VGG16 and SqueezeNet can improved the performance of 1.5x. More information about the performance uplift can be found here: https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.02 Change-Id: I024563c06b9aed02a211a974e452bae5c233b04c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117140 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h18
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h18
-rw-r--r--arm_compute/core/Types.h108
-rw-r--r--arm_compute/core/utils/misc/ShapeCalculator.h22
-rw-r--r--arm_compute/graph/Graph.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMM.h5
-rw-r--r--examples/graph_alexnet.cpp11
-rw-r--r--examples/graph_googlenet.cpp4
-rw-r--r--examples/graph_mobilenet.cpp2
-rw-r--r--examples/graph_squeezenet.cpp6
-rw-r--r--examples/graph_vgg16.cpp3
-rw-r--r--src/core/CL/CLKernelLibrary.cpp8
-rw-r--r--src/core/CL/cl_kernels/gemm.cl420
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp59
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp75
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp37
-rw-r--r--src/graph/Graph.cpp16
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp4
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp60
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp4
21 files changed, 521 insertions, 377 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index 2520eff5de..c0fef45afe 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ * After this operation, the output matrix will have the following shape: [ height * W, ceil(width / W) ] where W = 4 * mult_interleave4x4_height
*/
class CLGEMMInterleave4x4Kernel : public ICLKernel
{
@@ -64,18 +64,20 @@ public:
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 4e73d7eb13..7260c4a4f6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,8 +58,10 @@ public:
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+ *
*/
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
*
* @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
@@ -67,11 +69,13 @@ public:
* @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
* @param[in] gpu_target GPU Target
*
* @return a status
*/
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target);
+ static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+ GPUTarget gpu_target);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 8721643c1e..9a3069eab6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,7 +62,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
*
*/
class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
@@ -70,18 +70,20 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[in] output Output tensor. Data type supported: same as @p input.
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor. Data type supported: same as @p input.
+ * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 5402e358b5..5197000bf9 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -824,13 +824,95 @@ private:
const unsigned int _num_kernels;
};
-/** GEMM Information class. This class stores the necessary information to compute GEMM functions */
+/** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape.
+ *
+ * The matrix A can only be reshaped through @ref CLGEMMInterleave4x4Kernel or @ref NEGEMMInterleave4x4Kernel or @ref GCGEMMInterleave4x4Kernel
+ * Note: Optionally just for @ref CLGEMMInterleave4x4Kernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ *
+ * The matrix B can only be reshaped through @ref CLGEMMTranspose1xWKernel or @ref NEGEMMTranspose1xWKernel or @ref GCGEMMTranspose1xWKernel
+ * Note: Optionally just for @ref CLGEMMTranspose1xWKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
+ *
+ */
+class GEMMReshapeInfo final
+{
+public:
+ /** Default constructor */
+ GEMMReshapeInfo()
+ : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] m Number of matrix A rows
+ * @param[in] n Number of matrix B columns
+ * @param[in] k Number of matrix A columns or matrix B rows
+ * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
+ * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block
+ */
+ GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1)
+ : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height)
+ {
+ }
+ /** Number of matrix A rows
+ *
+ * @return the number of matrix A rows
+ */
+ int m() const
+ {
+ return _m;
+ }
+ /** Number of matrix B columns
+ *
+ * @return the number of matrix B columns
+ */
+ int n() const
+ {
+ return _n;
+ }
+ /** Number of matrix A columns or matrix B rows
+ *
+ * @return the number of matrix A columns or matrix B rows
+ */
+ int k() const
+ {
+ return _k;
+ }
+ /** Multiplication factor for the width of the 1xW transposed block
+ *
+ * @return the multiplication factor for the width of the 1xW transposed block
+ */
+ int mult_transpose1xW_width() const
+ {
+ return _mult_transpose1xW_width;
+ }
+ /** Multiplication factor for the height of the 4x4 interleaved block
+ *
+ * @return the multiplication factor for the height of the 4x4 interleaved block
+ */
+ int mult_interleave4x4_height() const
+ {
+ return _mult_interleave4x4_height;
+ }
+
+private:
+ const int _m;
+ const int _n;
+ const int _k;
+ const int _mult_transpose1xW_width;
+ const int _mult_interleave4x4_height;
+};
+
+/** GEMM information class. This class stores the necessary information to compute GEMM functions
+ *
+ * This object also contains the information about how matrix A and matrix B have been reshaped
+ *
+ */
class GEMMInfo
{
public:
/** Default constructor */
GEMMInfo()
- : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false)
+ : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _reshape_info()
{
}
/** Constructor
@@ -838,9 +920,10 @@ public:
* @param[in] is_a_reshaped True if the matrix A has been reshaped
* @param[in] is_b_reshaped True if the matrix B has been reshaped
* @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
+ * @param[in] reshape_info (Optional) GEMM reshape information object
*/
- GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run)
- : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run)
+ GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo())
+ : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _reshape_info(reshape_info)
{
}
/** Flag which specifies if the matrix A has been reshaped
@@ -869,11 +952,20 @@ public:
{
return _reshape_b_only_on_first_run;
};
+ /** GEMMReshapeInfo object which stores the necessary information to understand how the matrix A and matrix B have been reshaped
+ *
+ * @return the GEMMReshapeInfo object
+ */
+ const GEMMReshapeInfo &reshape_info() const
+ {
+ return _reshape_info;
+ }
private:
- const bool _is_a_reshaped;
- const bool _is_b_reshaped;
- const bool _reshape_b_only_on_first_run;
+ const bool _is_a_reshaped;
+ const bool _is_b_reshaped;
+ const bool _reshape_b_only_on_first_run;
+ GEMMReshapeInfo _reshape_info;
};
/** IO formatting information class*/
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 61834b88a9..6ecfdf0323 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,12 +39,14 @@ inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, co
permute(output_shape, perm);
return output_shape;
}
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a)
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1)
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
+ ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
+ const int interleave_width = 4 * mult_interleave4x4_height;
TensorShape shape_interleaved_a{ a.tensor_shape() };
- shape_interleaved_a.set(0, a.dimension(0) * 4);
- shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f));
+ shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
+ shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width)));
return shape_interleaved_a;
}
@@ -57,12 +59,14 @@ inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
return shape_transposed1xW_b;
}
-inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b)
+inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width = 1)
{
- // The transpose1xW output matrix will have the following shape:
- // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size) ]
+ // Note: mult_transpose1xW_width expresses the number of chunks with size 1x(W) we want to store on the same row
+ // The transpose1xW output matrix will have the following shape:
+ // [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
+ ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
TensorShape shape_transposed1xW_b{ b.tensor_shape() };
- const size_t transpose_width = 16 / b.element_size();
+ const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index ab1d8b8866..853b90df82 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef __ARM_COMPUTE_GRAPH_GRAPH_H__
#define __ARM_COMPUTE_GRAPH_GRAPH_H__
+#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/graph/INode.h"
#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/SubTensor.h"
@@ -67,9 +68,12 @@ public:
* @param[in] tensor Tensor to add
*/
void add_tensor_object(std::unique_ptr<ITensorObject> tensor);
- /** Finalizes the current node's configuration
+ /** Check if the OpenCL target is available
*/
static bool opencl_is_available();
+ /** Returns the GPU target
+ */
+ static GPUTarget gpu_target();
/** Manually sets the output of the current node
*
* @param[in] tmp Output info to set
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index bf41226bda..0f144915d7 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,7 +68,8 @@ public:
* @param[in] alpha Weight of the matrix product
* @param[in] beta Weight of matrix C
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
+ * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+ * in case matrix A and matrix B have been already transformed.
*/
void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp
index 8705c8ed1e..2f2c8bd182 100644
--- a/examples/graph_alexnet.cpp
+++ b/examples/graph_alexnet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,8 +54,10 @@ public:
constexpr float mean_b = 104.01f; /* Mean value to subtract from blue channel */
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
- TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
+
+ const bool is_gemm_convolution5x5 = Graph::gpu_target() == arm_compute::GPUTarget::MIDGARD || target_hint == TargetHint::NEON;
+ ConvolutionMethodHint convolution_5x5_hint = is_gemm_convolution5x5 ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
// Parse arguments
if(argc < 2)
@@ -102,7 +104,7 @@ public:
<< NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f))
<< PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)))
// Layer 2
- << convolution_hint
+ << convolution_5x5_hint
<< ConvolutionLayer(
5U, 5U, 256U,
get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy"),
@@ -111,6 +113,7 @@ public:
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
<< NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f))
<< PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)))
+ << ConvolutionMethodHint::GEMM
// Layer 3
<< ConvolutionLayer(
3U, 3U, 384U,
diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp
index 1e9601b492..b2e2f1bf8f 100644
--- a/examples/graph_googlenet.cpp
+++ b/examples/graph_googlenet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@ public:
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::GEMM;
// Parse arguments
if(argc < 2)
diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp
index 8c3f9b6fbc..83d1db9f0f 100644
--- a/examples/graph_mobilenet.cpp
+++ b/examples/graph_mobilenet.cpp
@@ -52,7 +52,7 @@ public:
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::GEMM;
// Set model to execute. 0 (MobileNetV1_1.0_224), 1 (MobileNetV1_0.75_160)
int model_id = (argc > 2) ? std::strtol(argv[2], nullptr, 10) : 0;
diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp
index b21f2fe5c4..e85108702d 100644
--- a/examples/graph_squeezenet.cpp
+++ b/examples/graph_squeezenet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,8 +59,7 @@ public:
constexpr float mean_b = 104.01f; /* Mean value to subtract from blue channel */
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
- TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
// Parse arguments
if(argc < 2)
@@ -102,7 +101,6 @@ public:
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy"),
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"),
PadStrideInfo(2, 2, 0, 0))
- << convolution_hint
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
<< PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
<< ConvolutionLayer(
diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp
index 1a804a4882..d97c5b5d02 100644
--- a/examples/graph_vgg16.cpp
+++ b/examples/graph_vgg16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,7 +91,6 @@ public:
<< convolution_hint
<< Tensor(TensorInfo(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32),
get_input_accessor(image, mean_r, mean_g, mean_b))
- << ConvolutionMethodHint::DIRECT
// Layer 1
<< ConvolutionLayer(
3U, 3U, 64U,
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6695881d09..ae3553860a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -211,9 +211,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
{ "gemm_accumulate_biases", "gemm.cl" },
- { "gemm_interleave4x4_8bit", "gemm.cl" },
- { "gemm_interleave4x4_16bit", "gemm.cl" },
- { "gemm_interleave4x4_32bit", "gemm.cl" },
+ { "gemm_interleave4x4", "gemm.cl" },
{ "gemm_ma_f16", "gemm.cl" },
{ "gemm_ma_f32", "gemm.cl" },
{ "gemm_ma_qs8", "gemm.cl" },
@@ -230,9 +228,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemm_mm_qs8", "gemm.cl" },
{ "gemm_mm_qs16", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
- { "gemm_transpose1x16", "gemm.cl" },
- { "gemm_transpose1x8", "gemm.cl" },
- { "gemm_transpose1x4", "gemm.cl" },
+ { "gemm_transpose1xW", "gemm.cl" },
{ "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
{ "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
{ "gemmlowp_mm_bifrost", "gemmlowp.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index c763cb355b..bad09f3c42 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,41 +27,23 @@
#include "fixed_point.h"
#endif // FIXED_POINT_POSITION
-/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- uint x = get_global_id(0);
- uint y = get_global_id(1);
+#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
- // Compute address for Matrix B - source
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- // Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+#if TRANSPOSE_W == 4
+#define DATA_TYPE uint
+#elif TRANSPOSE_W == 8
+#define DATA_TYPE ushort
+#elif TRANSPOSE_W == 16
+#define DATA_TYPE uchar
+#else // TRANSPOSE_W == 16
+#error "Transpose width not supported"
+#endif // TRANSPOSE_W
- uint4 b0 = vload4(0, (__global uint *)src.ptr);
-
- vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
+/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix
+ *
+ * @attention The multiplication factor (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -69,12 +51,12 @@ __kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
+__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
@@ -84,16 +66,22 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
Image src = CONVERT_TO_IMAGE_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+ uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
+ (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
- ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
+ b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
- vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
+ VSTORE(TRANSPOSE_W)
+ (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
}
+#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
+#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -106,9 +94,10 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
+__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
+ // Compute source and destination addresses
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -116,141 +105,35 @@ __kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
Image src = CONVERT_TO_IMAGE_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
-
- uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
-
- vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values from Matrix A
- uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
- uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
- uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
- uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
-
- uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
-
- val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
-
- val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
-
- val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values from Matrix A
- ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
- ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
- ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
- ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
-
- ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
-
- val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
-
- val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
-
- val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
+ (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
// Load values from Matrix A
- uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
- uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
- uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
- uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
-
- uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
- (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
-
- val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
- (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
-
- val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
- (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
-
- val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
- (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
}
+#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-#if defined(COLS_B)
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
@@ -270,30 +153,32 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
- // Divide by 4 in order to get the src_addr in unit of float
- src_addr = src_addr >> 2;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
float4 c00 = 0.0f;
@@ -301,11 +186,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
float4 c20 = 0.0f;
float4 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+ for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
- float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -313,8 +198,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
c30 += (float4)a0.s3 * b0;
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
- b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -322,11 +207,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
c30 += (float4)a0.s3 * b0;
}
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
- float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -371,23 +256,33 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+ __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
__global float *src_end_addr_b = src_addr_b + COLS_B;
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
// Reset accumulators
float c00 = 0.0f;
float c01 = 0.0f;
@@ -406,7 +301,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
float c32 = 0.0f;
float c33 = 0.0f;
- for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
@@ -433,8 +328,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4);
- b0 = vload4(0, src_addr_b + 4);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -457,8 +352,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 8);
- b0 = vload4(0, src_addr_b + 8);
+ a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -481,8 +376,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 12);
- b0 = vload4(0, src_addr_b + 12);
+ a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -505,7 +400,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
}
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+ for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
@@ -555,8 +450,6 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = c33 * ALPHA;
#endif // defined(ALPHA)
- barrier(CLK_GLOBAL_MEM_FENCE);
-
// Store 4x4 block
vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
@@ -584,30 +477,32 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
- // Divide by 2 in order to get the src_addr in unit of half
- src_addr = src_addr >> 1;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
half8 c00 = 0.0f;
@@ -615,11 +510,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
half8 c20 = 0.0f;
half8 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
- half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -627,8 +522,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 += (half8)a0.s3 * b0;
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
- b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -636,11 +531,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 += (half8)a0.s3 * b0;
}
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
- half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -689,27 +584,32 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes;
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global char *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
short8 c00 = 0.0f;
@@ -722,11 +622,11 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
short8 c31 = 0.0f;
// This for loop performs 1 accumulation for each iteration
- for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
- char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
+ char4 a0 = vload4(0, src_addr_a);
+ char16 b0 = vload16(0, src_addr_b);
c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
@@ -783,30 +683,32 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Divide by 2 in order to get the src_addr in unit of short
- src_addr = src_addr >> 1;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global short *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
int8 c00 = 0.0f;
@@ -815,11 +717,11 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
int8 c30 = 0.0f;
// This for loop performs 1 accumulation for each iteration
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
- short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
- short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
+ short4 a0 = vload4(0, src_addr_a);
+ short8 b0 = vload8(0, src_addr_b);
c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
@@ -850,7 +752,7 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
}
#endif // defined(FIXED_POINT_POSITION)
-#endif // defined(COLS_B)
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 6886f54602..241dd8549d 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,17 +40,16 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
{
+ ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
@@ -58,11 +57,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int mult_interleave4x4_height)
{
- unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_x = 4;
constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y * mult_interleave4x4_height;
bool window_changed = false;
// Configure kernel window
@@ -73,7 +72,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Configure window in case of configured output
if(output->total_size() != 0)
{
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+ const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
+ const float scale_y = 1.0f / (scale_x);
+
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
window_changed = window_changed || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, input->valid_region());
}
@@ -88,25 +90,42 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
{
}
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info(), mult_interleave4x4_height)));
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_interleave4x4_height));
_input = input;
_output = output;
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+ switch(input->info()->element_size())
+ {
+ case 1:
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ case 2:
+ build_opts.add_option("-DDATA_TYPE=ushort");
+ break;
+ case 4:
+ build_opts.add_option("-DDATA_TYPE=uint");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+
// Create kernel
- std::string kernel_name = "gemm_interleave4x4_" + support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts.options()));
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
+ auto win_config = validate_and_configure_window(input->info(), output->info(), mult_interleave4x4_height);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
@@ -119,10 +138,10 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
-Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_interleave4x4_height));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mult_interleave4x4_height).first);
return Status{};
}
@@ -144,10 +163,6 @@ void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queu
Window in_slice = window.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
- // Change x and y steps for the slide of output tensor
- out_slice.scale(Window::DimX, 4.f);
- out_slice.scale(Window::DimY, 0.25f);
-
do
{
unsigned int idx = 0;
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 19f38bf5a5..e23feb269a 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,24 +36,68 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <set>
#include <string>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
using ElementsProcessed = Steps;
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+
if(!is_interleaved_transposed)
{
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+ }
+ }
+ else
+ {
+ const int m = reshape_info.m();
+ const int n = reshape_info.n();
+ const int k = reshape_info.k();
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+ TensorShape tensor_shape0{ input0->tensor_shape() };
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+ }
}
return Status{};
@@ -122,12 +166,19 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ // Output tensor auto inizialitation if not yet initialized
+ TensorShape tensor_shape{ input0->info()->tensor_shape() };
+ tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
+ tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
+
+ auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
+
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
_input0 = input0;
_input1 = input1;
@@ -176,7 +227,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
std::string kernel_name;
if(is_interleaved_transposed)
{
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
+ build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+
if(data_type == DataType::F32)
{
kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
@@ -230,11 +287,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
+ const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
{
+ // Note: num_elements_processed will be set in validate_and_configure_window()
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
input1->clone().get(),
output->clone().get(),
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 69a545b76b..63aed6df32 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,8 +42,9 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
{
+ ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
@@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- compute_transpose1xW_with_element_size_shape(*input));
+ compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
@@ -59,11 +60,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width)
{
num_elems_processed_per_iteration = 16 / input->element_size();
- const int scale_x = num_elems_processed_per_iteration;
+ const int scale_x = num_elems_processed_per_iteration * mult_transpose1xW_width;
bool window_changed = false;
// Configure kernel window
@@ -90,25 +91,31 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info(), mult_transpose1xW_width)));
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width));
_input = input;
_output = output;
// Configure kernel window
+ // Note: num_elems_processed_per_iteration will be set in validate_and_configure_window()
unsigned int num_elems_processed_per_iteration = 1;
- auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
+
/*
* Following an example of how the transposition1xW works when the input data type is F32
*
@@ -117,18 +124,18 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp
* |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
* |a30 a31 a32 a33|
*
- * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
*/
// Create kernel
- std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+ std::string kernel_name = "gemm_transpose1xW";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
}
-Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
{
unsigned int num_elems_processed_per_iteration = 1;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first);
return Status{};
}
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index ac5316f55e..e14bea0846 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -215,11 +215,25 @@ void Graph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
_pimpl->_graph_output->allocate();
}
}
+
bool Graph::opencl_is_available()
{
return arm_compute::opencl_is_available();
}
+arm_compute::GPUTarget Graph::gpu_target()
+{
+ // Check if OpenCL is available before returning the GPU target
+ if(opencl_is_available())
+ {
+ return arm_compute::CLScheduler::get().target();
+ }
+ else
+ {
+ return GPUTarget::MIDGARD;
+ }
+}
+
void Graph::set_temp(TensorInfo &&tmp)
{
ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 68c6576a79..e9d14db96e 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
}
return Status{};
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index c676a10978..a09849ab93 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,6 +38,30 @@
using namespace arm_compute;
+namespace
+{
+inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+ bool flag = true;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ // COMPMID-852
+ if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+ {
+ const float scale = k < 1024 ? 2.0f : 2.5f;
+ flag = scale * n > 1.66f * n + 38.4f;
+ }
+ else
+ {
+ flag = false;
+ }
+ }
+
+ return flag;
+}
+} // namespace
+
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
_is_first_run(true), _reshape_b_only_on_first_run(false)
@@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- // For Bifrost architectures we do not reshape the input matrices
- _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST);
-
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
- // Set the target for the matrix multiply kernel
- _mm_kernel.set_target(CLScheduler::get().target());
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _interleave_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->info()->dimension(1);
+ const int n = b->info()->dimension(0);
+ const int k = a->info()->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ // Check if we need to reshape the matrix A and matrix B
+ _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
if(_is_interleaved_transposed)
{
@@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
+ _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
// Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
+ _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
_memory_group.manage(&_tmp_b);
}
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
if(_is_interleaved_transposed)
{
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2cd426b82d..5f886a02c6 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -148,8 +148,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, 1));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, 1));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
}
else