diff options
author | Gian Marco <gianmarco.iodice@arm.com> | 2018-01-30 13:35:54 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:47:18 +0000 |
commit | 19835e591cb0b66a0f5000ae1505bf299e50337d (patch) | |
tree | 525ee8b233a2cefe3b2734d76fdb91093b8c2d50 /src/runtime/CL/functions/CLGEMM.cpp | |
parent | 6fa009e05ae32e64f397f54087885c3eb68f0b4b (diff) | |
download | ComputeLibrary-19835e591cb0b66a0f5000ae1505bf299e50337d.tar.gz |
COMPMID-882 - Optimizing GEMMLowp on OpenCL reshaping matrices
This new optimization achieves 36.3% MAC utilisation on Mate 9 @ 1GHz.
The performance results have been reported here:
https://confluence.arm.com/display/MLENG/GEMMLowp+performance%3A+ACL+18.02
Change-Id: I71b6a217068763dfdc11bbf3574ee0eb94f93679
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118531
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime/CL/functions/CLGEMM.cpp')
-rw-r--r-- | src/runtime/CL/functions/CLGEMM.cpp | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index a09849ab93..f02eb169b7 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -50,7 +50,7 @@ inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, b if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run) { const float scale = k < 1024 ? 2.0f : 2.5f; - flag = scale * n > 1.66f * n + 38.4f; + flag = (scale * n) > ((1.66f * n) + 38.4f); } else { @@ -122,6 +122,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * matrix_a = &_tmp_a; matrix_b = &_tmp_b; + // Manage intermediate buffers + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel // Configure interleave kernel @@ -129,10 +133,6 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // Configure transpose kernel _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); } _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height)); |