diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2017-09-06 17:24:25 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:35:24 +0000 |
commit | 1c8409d7ce90ea449437076574c98a4ea90d9368 (patch) | |
tree | 8bd786b274ba9d905af803b481eccfa7635053f9 /src/core/CL/kernels | |
parent | 898a806bd0c72aeafb3557efdbc686aab54992d9 (diff) | |
download | ComputeLibrary-1c8409d7ce90ea449437076574c98a4ea90d9368.tar.gz |
COMPMID-477 - Optimized CLDirectConvolution1x1 for Bifrost
- Fixed bug in CLDirectConvolution3x3
Change-Id: Iaf34ef44f0b7bc02e66f3eb4452ff7a90ef83523
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86725
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r-- | src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 265c5074c5..75e6d5e971 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -108,7 +108,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
         options.emplace("-DHAS_BIAS");
     }
 
-    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (kernel_size != 1) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
+    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
     {
         options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
 
@@ -125,6 +125,14 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
 
         switch(kernel_size)
         {
+            case 1:
+            {
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 4;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_written_per_iteration_y = 4;
+                break;
+            }
             case 3:
             {
                 num_elems_read_per_iteration_x = 6;