diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2018-04-06 10:00:10 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:49:37 +0000 |
commit | c9c62c2fa1c80ba7f11b0d0732740460dfa00e74 (patch) | |
tree | 260052aa5c7172e2afc8517ae13adb75504ee62e /arm_compute/runtime/CL/functions | |
parent | 3ab6804a0e30be4d8591c8c84ae6a73940d0f2e2 (diff) | |
download | ComputeLibrary-c9c62c2fa1c80ba7f11b0d0732740460dfa00e74.tar.gz |
COMPMID-1056 - Optimizing CLGEMMMatrixMultiplyKernel refactoring the inner loop
Results reported at:
https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.05
Change-Id: I3246c4f19c4d21a7d6a44e4593bc5caffc016f81
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127838
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/runtime/CL/functions')
-rw-r--r-- | arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h | 6 |
1 files changed, 3 insertions, 3 deletions
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index 584266b824..67c0467f3a 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -27,11 +27,11 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" @@ -113,12 +113,12 @@ public: private: void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); - void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed = true); + void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); CLMemoryGroup _memory_group; CLIm2ColKernel _im2col_kernel; CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel; - CLGEMMMatrixMultiplyKernel _mm_kernel; + CLGEMM _mm_gemm; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; |