diff options
Diffstat (limited to 'src/runtime/CL')
-rw-r--r-- | src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 4 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLGEMM.cpp | 60 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 4 |
3 files changed, 55 insertions, 13 deletions
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 68c6576a79..e9d14db96e 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target)); } return Status{}; diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index c676a10978..a09849ab93 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -38,6 +38,30 @@ using namespace arm_compute; +namespace +{ +inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target) +{ + bool flag = true; + + if(gpu_target == GPUTarget::BIFROST) + { + // COMPMID-852 + if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run) + { + const float scale = k < 1024 ? 2.0f : 2.5f; + flag = scale * n > 1.66f * n + 38.4f; + } + else + { + flag = false; + } + } + + return flag; +} +} // namespace + CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) @@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors - // For Bifrost architectures we do not reshape the input matrices - _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST); - // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; - // Set the target for the matrix multiply kernel - _mm_kernel.set_target(CLScheduler::get().target()); + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _interleave_kernel.set_target(gpu_target); + _mm_kernel.set_target(gpu_target); + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo + // in order to know how the matrices have been reshaped + const int m = a->info()->dimension(1); + const int n = b->info()->dimension(0); + const int k = a->info()->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + if(gpu_target == GPUTarget::BIFROST) + { + mult_transpose1xW_width = 4; + mult_interleave4x4_height = 2; + } + + // Check if we need to reshape the matrix A and matrix B + _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); if(_is_interleaved_transposed) { @@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel // Configure interleave kernel - _interleave_kernel.configure(a, &_tmp_a); + _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height); // Configure transpose kernel - _transpose_kernel.configure(b, &_tmp_b); + _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width); // Manage intermediate buffers _memory_group.manage(&_tmp_a); _memory_group.manage(&_tmp_b); } - _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed); + _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height)); if(_is_interleaved_transposed) { diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 2cd426b82d..5f886a02c6 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -148,8 +148,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type()); TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, 1)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, 1)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); } else |