author    Gian Marco <gianmarco.iodice@arm.com>    2018-01-12 10:21:40 +0000
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:44:21 +0000
commit    36a0a4608bf413fc1fd65eb335bfb736ef602149 (patch)
tree      2ff0e35dc9e16fedd601b1f24bdc13d25d075b90 /src/runtime/CL/functions/CLGEMM.cpp
parent    46edf63bd630f5e3f3eb31b7d4602caa317da075 (diff)
COMPMID-748 - Integrating optimized SGEMM for Bifrost

This patch introduces a new GEMM that improves MAC utilisation by 10% compared to the GEMM without reshape. However, this implementation is not faster in all cases, since the time spent reshaping the matrices must be taken into account. For this reason, a heuristic that selects the optimal GEMM variant has been added to the function. More information about the heuristic implementation can be found at COMPMID-852.

With this patch, GoogleNet, MobileNet, VGG16 and SqueezeNet improve performance by 1.5x. More information about the performance uplift can be found here: https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.02

Change-Id: I024563c06b9aed02a211a974e452bae5c233b04c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117140
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
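Once the K and M preconditions are met, the heuristic mentioned above boils down to a size threshold on N. Here is a minimal standalone sketch of the decision (the helper name is hypothetical; it mirrors the is_interleaved_transposed() check added in the diff below, with reshape_b_only_on_first_run and FP32 assumed):

#include <cstdio>

// Hypothetical mirror of the Bifrost FP32 heuristic from this patch;
// assumes reshape_b_only_on_first_run is true and the data type is F32.
static bool use_reshaped_gemm(int m, int n, int k)
{
    if(k <= 256 || m <= 4)
    {
        return false; // Not enough work for the reshape overhead to pay off
    }
    const float scale = k < 1024 ? 2.0f : 2.5f;
    // Reshape wins when scale * n > 1.66 * n + 38.4, i.e. roughly
    // n > 113 for k < 1024 and n > 46 for k >= 1024.
    return scale * n > 1.66f * n + 38.4f;
}

int main()
{
    std::printf("%d\n", use_reshaped_gemm(64, 128, 512)); // 1: N above the crossover
    std::printf("%d\n", use_reshaped_gemm(64, 64, 512));  // 0: N below the crossover
    return 0;
}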
Diffstat (limited to 'src/runtime/CL/functions/CLGEMM.cpp')
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp  60
1 file changed, 51 insertions, 9 deletions
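For context, here is a caller-side sketch of how the reshape_b_only_on_first_run hint reaches this heuristic. It is a minimal usage example against the ACL 18.02 API shown in the diff below; the tensor shapes and the GEMMInfo flags are illustrative, not taken from this patch:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // A: M x K = 64 x 512, B: K x N = 512 x 128, output: M x N = 64 x 128.
    // TensorShape lists the innermost (x) dimension first.
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(512U, 64U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(128U, 512U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));

    // reshape_b_only_on_first_run = true is one of the conditions under which
    // the Bifrost heuristic selects the reshaped (interleaved/transposed) path.
    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    gemm.run();
    CLScheduler::get().sync();
    return 0;
}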
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index c676a10978..a09849ab93 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,6 +38,30 @@
using namespace arm_compute;
+namespace
+{
+inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+ bool flag = true;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ // COMPMID-852
+ if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+ {
+ const float scale = k < 1024 ? 2.0f : 2.5f;
+ flag = scale * n > 1.66f * n + 38.4f;
+ }
+ else
+ {
+ flag = false;
+ }
+ }
+
+ return flag;
+}
+} // namespace
+
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
_is_first_run(true), _reshape_b_only_on_first_run(false)
@@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- // For Bifrost architectures we do not reshape the input matrices
- _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST);
-
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
- // Set the target for the matrix multiply kernel
- _mm_kernel.set_target(CLScheduler::get().target());
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _interleave_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->info()->dimension(1);
+ const int n = b->info()->dimension(0);
+ const int k = a->info()->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ // Check if we need to reshape the matrix A and matrix B
+ _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
if(_is_interleaved_transposed)
{
@@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
+ _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
// Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
+ _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
_memory_group.manage(&_tmp_b);
}
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
if(_is_interleaved_transposed)
{