aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
diff options
context:
space:
mode:
author Anthony Barbier <anthony.barbier@arm.com> 2017-12-12 17:17:50 +0000
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:33 +0000
commit 1568621e07cef67c5bb01fa4cc827e218302040c (patch)
tree cbe124f50043be3e1aa64abafe5d68a93cbbd305 /src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
parent 93b9bdb49f0f1e715c7ad251b6886c1a49945b5a (diff)
download ComputeLibrary-1568621e07cef67c5bb01fa4cc827e218302040c.tar.gz
COMPMID-744 Fixed access windows in DepthwiseConvolutionLayer
Change-Id: I7bf4b5b85ce1e89006906db59403d06580e2e810
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112974
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp')
-rw-r--r-- src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp 25
1 files changed, 14 insertions, 11 deletions
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 02962e0492..dd5c44801e 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -60,7 +60,6 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const
weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
conv_info);
- ARM_COMPUTE_UNUSED(expected_output);
ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
@@ -69,6 +68,7 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const
_weights = weights;
_conv_info = conv_info;
const unsigned int conv_stride_x = conv_info.stride().first;
+ const unsigned int conv_stride_y = conv_info.stride().second;
const unsigned int conv_pad_x = conv_info.pad().first;
const unsigned int conv_pad_y = conv_info.pad().second;
@@ -80,9 +80,12 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration));
- AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input->info()->dimension(0) + _border_size.right, input->info()->dimension(1) + _border_size.bottom);
- AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ const unsigned int num_x_steps = (expected_output.first + num_elems_written_per_iteration - 1) / num_elems_written_per_iteration;
+ const int input_num_elems_processed = get_input_num_elems_processed(num_elems_written_per_iteration, conv_stride_x);
+
+ AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, (num_x_steps - 1) * input_num_elems_processed + 12, conv_stride_y * (expected_output.second - 1) + 2);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+ AccessWindowStatic output_access(output->info(), 0, 0, num_x_steps * num_elems_written_per_iteration, expected_output.second);
update_window_and_padding(win, input_access, weights_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
@@ -134,13 +137,13 @@ public:
int ih = 0;
int oh = 0;
- const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
- const auto ptr_weights_r0 = reinterpret_cast<const float *>(ptr_weights_base);
- const auto ptr_weights_r1 = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y);
- const auto ptr_weights_r2 = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y * 2);
- const auto vw_r0 = load_matrix_row(ptr_weights_r0);
- const auto vw_r1 = load_matrix_row(ptr_weights_r1);
- const auto vw_r2 = load_matrix_row(ptr_weights_r2);
+ const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
+ const auto ptr_weights_r0 = reinterpret_cast<const float *>(ptr_weights_base);
+ const auto ptr_weights_r1 = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y);
+ const auto ptr_weights_r2 = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y * 2);
+ const float32x4x3_t vw_r0 = load_matrix_row(ptr_weights_r0);
+ const float32x4x3_t vw_r1 = load_matrix_row(ptr_weights_r1);
+ const float32x4x3_t vw_r2 = load_matrix_row(ptr_weights_r2);
for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
{