aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2017-09-06 17:24:25 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commit1c8409d7ce90ea449437076574c98a4ea90d9368 (patch)
tree8bd786b274ba9d905af803b481eccfa7635053f9 /src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
parent898a806bd0c72aeafb3557efdbc686aab54992d9 (diff)
downloadComputeLibrary-1c8409d7ce90ea449437076574c98a4ea90d9368.tar.gz
COMPMID-477 - Optimized CLDirectConvolution1x1 for Bifrost
- Fixed bug in CLDirectConvolution3x3 Change-Id: Iaf34ef44f0b7bc02e66f3eb4452ff7a90ef83523 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86725 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp')
-rw-r--r--src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp10
1 files changed, 9 insertions, 1 deletions
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 265c5074c5..75e6d5e971 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -108,7 +108,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
options.emplace("-DHAS_BIAS");
}
- if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (kernel_size != 1) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
+ if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
{
options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
@@ -125,6 +125,14 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
switch(kernel_size)
{
+ case 1:
+ {
+ num_elems_read_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 4;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 4;
+ break;
+ }
case 3:
{
num_elems_read_per_iteration_x = 6;