diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2018-12-12 10:18:04 +0000 |
---|---|---|
committer | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2018-12-14 14:57:48 +0000 |
commit | bf9731edfa0439cad4d70efc3065e71e199c62b8 (patch) | |
tree | 71340a3d04a6294744c642ed6e4a56c0e8a77592 /arm_compute/runtime/CL | |
parent | 92e278d5f462c930af1947883a5f48c10586ae9c (diff) | |
download | ComputeLibrary-bf9731edfa0439cad4d70efc3065e71e199c62b8.tar.gz |
COMPMID-1687: Optimize CLGEMMMatrixMultiplyKernel for Mali-G76 - Part1
The current implementation is limited just to FP32
Change-Id: I185ab57e483e879d7c301e9cc3033efc8b41e244
Reviewed-on: https://review.mlplatform.org/389
Reviewed-by: Anthony Barbier <Anthony.barbier@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'arm_compute/runtime/CL')
-rw-r--r-- | arm_compute/runtime/CL/functions/CLGEMM.h | 40 |
1 file changed, 25 insertions, 15 deletions
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index c4513f29d9..7d47194e56 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -27,6 +27,9 @@
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -39,9 +42,12 @@ class ICLTensor;
 /** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
  *
- * -# @ref CLGEMMInterleave4x4Kernel (only if the reshaped GEMM is selected by the heuristic model)
- * -# @ref CLGEMMTranspose1xWKernel (only if the reshaped GEMM is selected by the heuristic model)
- * -# @ref CLGEMMMatrixMultiplyKernel
+ * -# @ref CLGEMMInterleave4x4Kernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target is NOT Mali-G76)
+ * -# @ref CLGEMMReshapeLHSMatrixKernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target IS Mali-G76)
+ * -# @ref CLGEMMTranspose1xWKernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target is NOT Mali-G76)
+ * -# @ref CLGEMMReshapeRHSMatrixKernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target IS Mali-G76)
+ * -# @ref CLGEMMMatrixMultiplyKernel (if GPU target is NOT G76 or if the reshaped GEMM is NOT selected)
+ * -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target IS Mali-G76)
  * -# @ref CLGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
  *
  */
@@ -100,18 +106,22 @@ public:
 void prepare() override;
 private:
- CLMemoryGroup _memory_group;
- CLGEMMInterleave4x4Kernel _interleave_kernel;
- CLGEMMTranspose1xWKernel _transpose_kernel;
- CLGEMMMatrixMultiplyKernel _mm_kernel;
- CLGEMMMatrixAdditionKernel _ma_kernel;
- CLTensor _tmp_a;
- CLTensor _tmp_b;
- const ICLTensor *_original_b;
- bool _is_interleaved_transposed;
- bool _run_addition;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
+ CLMemoryGroup _memory_group;
+ CLGEMMInterleave4x4Kernel _interleave_kernel; // TODO - COMPMID-1835: Remove this kernel and use CLGEMMReshapeLHSMatrixKernel
+ CLGEMMTranspose1xWKernel _transpose_kernel; // TODO - COMPMID-1836: Remove this kernel and use CLGEMMReshapeRHSMatrixKernel
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLGEMMMatrixAdditionKernel _ma_kernel;
+ CLGEMMReshapeLHSMatrixKernel _reshape_lhs_kernel;
+ CLGEMMReshapeRHSMatrixKernel _reshape_rhs_kernel;
+ CLGEMMMatrixMultiplyReshapedKernel _mm_reshaped_kernel;
+ CLTensor _tmp_a;
+ CLTensor _tmp_b;
+ const ICLTensor *_original_b;
+ bool _is_interleaved_transposed;
+ bool _run_addition;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _is_G76_path; // TODO: To be removed once completed COMPMID-1835 and COMPMID-1836
 };
 }