COMPMID-816 - Optimizing CLGEMMLowpMatrixMultiplyCore - Part1

The performance improvements have been reported at the following Confluence page:
https://confluence.arm.com/display/MLENG/GEMMLowp+performance%3A+ACL+18.02

Config3 of McVail looks improved by 29x.

Change-Id: I8b203c0b75fc368f85cea863b7eed398fab3e79a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115783
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
author: Gian Marco <gianmarco.iodice@arm.com> 2018-01-10 15:56:30 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:43:42 +0000
commit: 7b4d547800d3ea49e7e6d9f497ec2766411cb948 (patch)
tree: efff27f22ed692314690a666dc4c0e23838b78c5 /src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
parent: dde3ad94dc11fae29dba862a1ad657f551f36763 (diff)
download: ComputeLibrary-7b4d547800d3ea49e7e6d9f497ec2766411cb948.tar.gz
1 files changed, 6 insertions, 3 deletions
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index ddcab6a256..2cd426b82d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,10 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     _b_offset                    = b->info()->quantization_info().offset;
 
     // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
-    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+    _is_interleaved_transposed = (a->info()->dimension(1)) > 16 && (CLScheduler::get().target() != GPUTarget::BIFROST);
+
+    // Set the target for the matrix multiply kernel
+    _mm_kernel.set_target(CLScheduler::get().target());
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -138,7 +141,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
 
     int32_t a_offset                  = a->quantization_info().offset;
     int32_t b_offset                  = b->quantization_info().offset;
-    bool    is_interleaved_transposed = a->dimension(1) > 16;
+    bool    is_interleaved_transposed = (a->dimension(1)) > 16 && (CLScheduler::get().target() != GPUTarget::BIFROST);
 
     if(is_interleaved_transposed)
     {
author	Gian Marco <gianmarco.iodice@arm.com>	2018-01-10 15:56:30 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:43:42 +0000
commit	7b4d547800d3ea49e7e6d9f497ec2766411cb948 (patch)
tree	efff27f22ed692314690a666dc4c0e23838b78c5 /src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
parent	dde3ad94dc11fae29dba862a1ad657f551f36763 (diff)
download	ComputeLibrary-7b4d547800d3ea49e7e6d9f497ec2766411cb948.tar.gz