From 7b4d547800d3ea49e7e6d9f497ec2766411cb948 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Wed, 10 Jan 2018 15:56:30 +0000 Subject: COMPMID-816 - Optimizing CLGEMMLowpMatrixMultiplyCore - Part1 The performance improvements have been reported at the following confluence page: https://confluence.arm.com/display/MLENG/GEMMLowp+performance%3A+ACL+18.02 Config3 of McVail looks improved by 29x Change-Id: I8b203c0b75fc368f85cea863b7eed398fab3e79a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115783 Reviewed-by: Georgios Pinitas Reviewed-by: Michalis Spyrou Tested-by: Jenkins --- src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp') diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index ddcab6a256..2cd426b82d 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -52,7 +52,10 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor _b_offset = b->info()->quantization_info().offset; // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors - _is_interleaved_transposed = a->info()->dimension(1) > 16; + _is_interleaved_transposed = (a->info()->dimension(1)) > 16 && (CLScheduler::get().target() != GPUTarget::BIFROST); + + // Set the target for the matrix multiply kernel + _mm_kernel.set_target(CLScheduler::get().target()); const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; @@ -138,7 +141,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso int32_t a_offset = a->quantization_info().offset; int32_t b_offset = b->quantization_info().offset; - bool is_interleaved_transposed = a->dimension(1) > 16; + bool is_interleaved_transposed = (a->dimension(1)) > 16 && (CLScheduler::get().target() != GPUTarget::BIFROST); if(is_interleaved_transposed) { -- cgit v1.2.1