From 944d3f79baef6878916c1ec082a71768f0bf3409 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Tue, 16 Jan 2018 15:38:35 +0000
Subject: COMPMID-751 Processing 8 elements makes computation up to 80us faster on MobileNet QASYMM8 dwc layers

Change-Id: I30eaea3f3625086e311ad201ef73a8f06a01e382
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116521
Tested-by: Jenkins
Reviewed-by: Georgios Pinitas
---
 src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/core/CL/kernels')

diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index f9229ba294..1c0fe9984f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,9 +158,9 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con
     }
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 2;
-    const unsigned int num_elems_written_per_iteration = 2;
-    const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x;
+    const unsigned int num_elems_processed_per_iteration = 8 / data_size_from_type(input->info()->data_type());
+    const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration;
+    const unsigned int num_elems_read_per_iteration = 3 + (num_elems_processed_per_iteration - 1) * _conv_stride_x;
     const unsigned int num_rows_read_per_iteration = 3;
 
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-- 
cgit v1.2.1