author    Gian Marco Iodice <gianmarco.iodice@arm.com>  2019-06-13 15:58:32 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>   2019-07-05 11:41:58 +0000
commit    95f9361416a0db7fee2e35bd0f7afbecbb7352c2 (patch)
tree      2e41341d19dac1108069acf79c7c364bec9b7e16 /src/core/NEON/kernels
parent    53860dd80fd6e91ce04756032a969d4b37e00a61 (diff)
COMPMID-2185: Implement direct convolution 9x9 (NHWC) on NEON
Change-Id: I3f47947bdbc80946668165c8fc2ce52eca7722a6
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1409
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
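For reference, the computation this patch vectorizes is a direct 9x9 convolution over an NHWC tensor: every output point is the sum, over the 9x9 spatial window and all input channels, of input times weight. The scalar loop nest below is only an illustrative sketch (stride 1, no padding, hypothetical names and shapes); the actual kernel works on arm_compute ITensor buffers and byte strides.

// Illustrative scalar reference of a 9x9 direct convolution in NHWC layout.
void direct_conv9x9_nhwc_ref(const float *src, const float *weights, float *dst,
                             int H, int W, int C, int OFM)
{
    const int OH = H - 8; // output height for a 9x9 kernel, stride 1, no padding
    const int OW = W - 8; // output width
    for(int ofm = 0; ofm < OFM; ++ofm)
    {
        for(int oy = 0; oy < OH; ++oy)
        {
            for(int ox = 0; ox < OW; ++ox)
            {
                float acc = 0.f;
                for(int ky = 0; ky < 9; ++ky)
                {
                    for(int kx = 0; kx < 9; ++kx)
                    {
                        for(int c = 0; c < C; ++c)
                        {
                            // NHWC: channels are contiguous in memory
                            acc += src[((oy + ky) * W + (ox + kx)) * C + c] *
                                   weights[((ofm * 9 + ky) * 9 + kx) * C + c];
                        }
                    }
                }
                dst[(oy * OW + ox) * OFM + ofm] = acc;
            }
        }
    }
}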
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 344
1 file changed, 333 insertions(+), 11 deletions(-)
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index d557cfa1bd..bcf70b3ad8 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -865,6 +865,256 @@ public:
}
};
+inline void convolve_row1x9_nhwc(const float *row_ptr, const float *weights_ptr, size_t src_stride_y, size_t weights_stride_y,
+ float32x4_t &acc0, float32x4_t &acc1, float32x4_t &acc2, float32x4_t &acc3)
+{
+ // Load 4 channels for each of the 12 input values along the same X spatial dimension
+ const float32x4_t src0 = wrapper::vloadq(row_ptr);
+ const float32x4_t src1 = wrapper::vloadq(row_ptr + 1 * src_stride_y);
+ const float32x4_t src2 = wrapper::vloadq(row_ptr + 2 * src_stride_y);
+ const float32x4_t src3 = wrapper::vloadq(row_ptr + 3 * src_stride_y);
+ const float32x4_t src4 = wrapper::vloadq(row_ptr + 4 * src_stride_y);
+ const float32x4_t src5 = wrapper::vloadq(row_ptr + 5 * src_stride_y);
+ const float32x4_t src6 = wrapper::vloadq(row_ptr + 6 * src_stride_y);
+ const float32x4_t src7 = wrapper::vloadq(row_ptr + 7 * src_stride_y);
+ const float32x4_t src8 = wrapper::vloadq(row_ptr + 8 * src_stride_y);
+ const float32x4_t src9 = wrapper::vloadq(row_ptr + 9 * src_stride_y);
+ const float32x4_t src10 = wrapper::vloadq(row_ptr + 10 * src_stride_y);
+ const float32x4_t src11 = wrapper::vloadq(row_ptr + 11 * src_stride_y);
+
+ // Load 4 channels for each of the 9 weight values along the same X spatial dimension
+ const float32x4_t w0 = wrapper::vloadq(weights_ptr);
+ const float32x4_t w1 = wrapper::vloadq(weights_ptr + 1 * weights_stride_y);
+ const float32x4_t w2 = wrapper::vloadq(weights_ptr + 2 * weights_stride_y);
+ const float32x4_t w3 = wrapper::vloadq(weights_ptr + 3 * weights_stride_y);
+ const float32x4_t w4 = wrapper::vloadq(weights_ptr + 4 * weights_stride_y);
+ const float32x4_t w5 = wrapper::vloadq(weights_ptr + 5 * weights_stride_y);
+ const float32x4_t w6 = wrapper::vloadq(weights_ptr + 6 * weights_stride_y);
+ const float32x4_t w7 = wrapper::vloadq(weights_ptr + 7 * weights_stride_y);
+ const float32x4_t w8 = wrapper::vloadq(weights_ptr + 8 * weights_stride_y);
+
+ // Accumulate 4 channels for each of the 4 output values along the same X spatial dimension
+ acc0 = wrapper::vmla(acc0, w0, src0);
+ acc0 = wrapper::vmla(acc0, w1, src1);
+ acc0 = wrapper::vmla(acc0, w2, src2);
+ acc0 = wrapper::vmla(acc0, w3, src3);
+ acc0 = wrapper::vmla(acc0, w4, src4);
+ acc0 = wrapper::vmla(acc0, w5, src5);
+ acc0 = wrapper::vmla(acc0, w6, src6);
+ acc0 = wrapper::vmla(acc0, w7, src7);
+ acc0 = wrapper::vmla(acc0, w8, src8);
+
+ acc1 = wrapper::vmla(acc1, w0, src1);
+ acc1 = wrapper::vmla(acc1, w1, src2);
+ acc1 = wrapper::vmla(acc1, w2, src3);
+ acc1 = wrapper::vmla(acc1, w3, src4);
+ acc1 = wrapper::vmla(acc1, w4, src5);
+ acc1 = wrapper::vmla(acc1, w5, src6);
+ acc1 = wrapper::vmla(acc1, w6, src7);
+ acc1 = wrapper::vmla(acc1, w7, src8);
+ acc1 = wrapper::vmla(acc1, w8, src9);
+
+ acc2 = wrapper::vmla(acc2, w0, src2);
+ acc2 = wrapper::vmla(acc2, w1, src3);
+ acc2 = wrapper::vmla(acc2, w2, src4);
+ acc2 = wrapper::vmla(acc2, w3, src5);
+ acc2 = wrapper::vmla(acc2, w4, src6);
+ acc2 = wrapper::vmla(acc2, w5, src7);
+ acc2 = wrapper::vmla(acc2, w6, src8);
+ acc2 = wrapper::vmla(acc2, w7, src9);
+ acc2 = wrapper::vmla(acc2, w8, src10);
+
+ acc3 = wrapper::vmla(acc3, w0, src3);
+ acc3 = wrapper::vmla(acc3, w1, src4);
+ acc3 = wrapper::vmla(acc3, w2, src5);
+ acc3 = wrapper::vmla(acc3, w3, src6);
+ acc3 = wrapper::vmla(acc3, w4, src7);
+ acc3 = wrapper::vmla(acc3, w5, src8);
+ acc3 = wrapper::vmla(acc3, w6, src9);
+ acc3 = wrapper::vmla(acc3, w7, src10);
+ acc3 = wrapper::vmla(acc3, w8, src11);
+}
+
+// Sum the four lanes of a float32x4_t into a single float
+float vreduce(const float32x4_t &v)
+{
+ auto v0 = wrapper::vgethigh(v);
+ auto v1 = wrapper::vgetlow(v);
+ auto v_out = wrapper::vadd(v0, v1);
+
+ float a = wrapper::vgetlane(v_out, 0);
+ float b = wrapper::vgetlane(v_out, 1);
+ return a + b;
+}
+
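Taken together, convolve_row1x9_nhwc and vreduce form the inner step of the kernel: each float32x4_t accumulator carries partial sums for 4 input channels of one output point, the row helper folds one 1x9 kernel row into 4 adjacent output points at once, and vreduce collapses the 4 channel lanes into the final scalar. The scalar restatement below (a hypothetical helper, strides in floats) shows what one kernel row contributes to one accumulator after reduction.

#include <cstddef>

// Scalar equivalent of one 1x9 row's contribution to output point 'out_x',
// i.e. what convolve_row1x9_nhwc accumulates into accN before vreduce sums the lanes.
float row1x9_scalar(const float *row, const float *w_row,
                    size_t src_stride_y, size_t weights_stride_y, int out_x)
{
    float sum = 0.f;
    for(int k = 0; k < 9; ++k)     // the 9 taps of the kernel row
    {
        for(int c = 0; c < 4; ++c) // the 4 channels processed per vector
        {
            sum += row[(out_x + k) * src_stride_y + c] * w_row[k * weights_stride_y + c];
        }
    }
    return sum;
}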
+template <typename V>
+class convolver_9x9_nhwc
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ // Declare useful types
+ using vector_type = typename V::type;
+ using scalar_type = typename V::scalar_type;
+ using tag_type = typename V::tag_type;
+
+ // Scalar quantities
+ const int element_size = input->info()->element_size();
+ const int input_width = input->info()->dimension(0);
+ const int input_depth = input->info()->dimension(2);
+ const int input_stride_y = input->info()->strides_in_bytes().y() / element_size;
+ const int input_stride_z = input->info()->strides_in_bytes().z() / element_size;
+ const int input_stride_w = input->info()->strides_in_bytes()[3];
+ const int output_stride_x = output->info()->strides_in_bytes().x();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y() / element_size;
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z() / element_size;
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_top = conv_info.pad_top();
+ const unsigned int conv_pad_left = conv_info.pad_left();
+
+ // Setup input window for the input iterator
+ Window window_in = window;
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup window for the output iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Setup window for the weights iterator
+ Window window_k = calculate_max_window(*weights->info(), Steps());
+ window_k.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_k.set(Window::DimY, Window::Dimension(0, 1, 1));
+ window_k.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1));
+
+ Iterator in(input, window_in);
+ Iterator out(output, window_out);
+ Iterator k(weights, window_k);
+
+ // Calculate the max_offset.
+ // max_offset is the offset of the last non-valid value in the Z dimension (spatial dimension Y for NHWC)
+ // |******************|
+ // | pad_top |
+ // |******************|
+ // | |
+ // | plane0 |
+ // | batch0 |
+ // |__________________|
+ // |******************| Batch 0
+ // | pad_bottom |
+ // | pad_top |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | batch0 |
+ // |__________________|-----> max_offset
+ // |******************|
+ // | pad_bottom |
+ // | pad_top |
+ // |******************|
+ // | |
+ // | plane0 |
+ // | batch1 |
+ // |__________________|
+ // |******************| Batch 1
+ // | pad_bottom |
+ // | pad_top |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | batch1 |
+ // |__________________|
+ // | pad_bottom |
+ // |******************|
+ const int max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y;
+ execute_window_loop(window_k, [&](const Coordinates & id_k) // loop over the output feature maps (weights dimension 3)
+ {
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const auto y_offset = int(id.y() - conv_pad_left) * input_stride_y;
+
+ // Buffer pointers
+ const scalar_type *in_ptr = reinterpret_cast<scalar_type *>(input->buffer() + input->info()->offset_first_element_in_bytes() + id[3] * input_stride_w);
+ const scalar_type *weights_ptr = reinterpret_cast<scalar_type *>(k.ptr());
+ uint8_t *out_ptr = out.ptr() + id_k[3] * output_stride_x;
+
+ // Output elements
+ vector_type out0 = wrapper::vdup_n(scalar_type(0), tag_type());
+ vector_type out1 = wrapper::vdup_n(scalar_type(0), tag_type());
+ vector_type out2 = wrapper::vdup_n(scalar_type(0), tag_type());
+ vector_type out3 = wrapper::vdup_n(scalar_type(0), tag_type());
+
+ // Reduce along the feature maps
+ for(int x = 0; x < input_width; x += num_elems_read_per_iteration)
+ {
+ // z == 0
+ auto in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top);
+ in_z = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+ auto offset = y_offset + in_z * input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 0 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 1
+ in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 1);
+ in_z = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+ offset = y_offset + in_z * input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 1 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 2
+ in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 2);
+ in_z = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+ offset = y_offset + in_z * input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 2 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 3
+ in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 3);
+ offset = y_offset + in_z * input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 3 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 4
+ in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 4);
+ offset = y_offset + in_z * input_stride_z;
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 4 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 5
+ offset += input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 5 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 6
+ offset += input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 6 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 7
+ offset += input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 7 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+ // z == 8
+ offset += input_stride_z;
+ offset = std::min(offset, max_offset);
+ convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 8 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+ }
+
+ *(reinterpret_cast<scalar_type *>(out_ptr + 0 * output_stride_y)) = vreduce(out0);
+ *(reinterpret_cast<scalar_type *>(out_ptr + 1 * output_stride_y)) = vreduce(out1);
+ *(reinterpret_cast<scalar_type *>(out_ptr + 2 * output_stride_y)) = vreduce(out2);
+ *(reinterpret_cast<scalar_type *>(out_ptr + 3 * output_stride_y)) = vreduce(out3);
+ },
+ in, out);
+ },
+ k);
+ }
+};
+
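The z-loop in convolver_9x9_nhwc::convolve walks the 9 kernel rows and relies on two clamps to stay inside the padded input plane: an unsigned-compare trick (applied to the first rows, where in_z can be negative inside the top padding) that redirects such rows past input_depth, and a std::min against max_offset that keeps the resulting element offset within the zero-filled bottom border. A standalone restatement of that clamping, with a hypothetical function name and parameters, is:

#include <algorithm>

// Hypothetical restatement of the row-offset clamping used in convolver_9x9_nhwc::convolve.
// All offsets and strides are expressed in elements of the source type.
int clamp_row_offset(int id_z, int kernel_row, int conv_stride_y, int conv_pad_top,
                     int y_offset, int input_stride_z, int input_depth, int max_offset)
{
    int in_z = id_z * conv_stride_y - conv_pad_top + kernel_row; // negative while inside the top padding
    // Unsigned compare: a negative in_z wraps to a huge value and is clamped to input_depth,
    // i.e. redirected past the last valid plane.
    in_z = static_cast<int>(std::min(static_cast<unsigned int>(in_z),
                                     static_cast<unsigned int>(input_depth)));
    const int offset = y_offset + in_z * input_stride_z;
    return std::min(offset, max_offset); // never read beyond the zero-filled bottom border
}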
template <typename T1, typename T2>
inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -969,6 +1219,21 @@ inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_i
}
}
+template <typename V>
+inline void convolve_9x9_nhwc(const Window &window, unsigned int num_elems_read_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_9x9_nhwc<V>::convolve(window, num_elems_read_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
@@ -1122,18 +1387,62 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
else
{
- border_size.left = 0;
- border_size.top = conv_info.pad_left();
- border_size.right = 0;
- border_size.bottom = conv_info.pad_right();
+ if(kernel_size == 9)
+ {
+ border_size.left = 0;
+ border_size.top = conv_info.pad_left();
+
+ const int num_elems_read_per_iteration_x = 4;
+ const int num_elems_written_per_iteration_x = 1;
+ const int num_elems_read_per_iteration_y = 12;
+ const int num_elems_written_per_iteration_y = 4;
+
+ num_elems_read_per_iteration = num_elems_read_per_iteration_x;
+ num_elems_written_per_iteration = num_elems_written_per_iteration_x;
+
+ border_size.right = num_elems_read_per_iteration_x;
+ if((conv_info.pad_bottom() != 0) || (conv_info.pad_top() != 0))
+ {
+ // If bottom or top padding is set, num_elems_read_per_iteration_y zero-initialized rows must be readable below the input.
+ // Since num_elems_read_per_iteration_y is always greater than conv_info.pad_right(), we can set
+ // the bottom border to num_elems_read_per_iteration_y
+ border_size.bottom = num_elems_read_per_iteration_y;
+ }
+ else if(conv_info.pad_right() != 0)
+ {
+ // Conventional border padding. Fill the bottom padding so that we can read in batches of num_elems_read_per_iteration_y
+ border_size.bottom = ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y) - input->dimension(1);
+ }
+ else
+ {
+ // No padding
+ border_size.bottom = 0;
+ }
- num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
+ win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
- win = calculate_max_window(*output, Steps());
+ AccessWindowStatic input_access(input, 0, -border_size.top,
+ ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x),
+ input->dimension(1) + border_size.bottom);
- AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
- AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
- window_changed = update_window_and_padding(win, input_access, weights_access);
+ AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_read_per_iteration_x), weights->dimension(1));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+ window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ else
+ {
+ border_size.left = 0;
+ border_size.top = conv_info.pad_left();
+ border_size.right = 0;
+ border_size.bottom = conv_info.pad_right();
+ num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
+ win = calculate_max_window(*output, Steps());
+
+ AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
+ AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
+ window_changed = update_window_and_padding(win, input_access, weights_access);
+ }
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
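As a worked example of the bottom-border arithmetic in the pad_right branch above, with illustrative numbers and assuming ceil_to_multiple(a, b) rounds a up to the next multiple of b (as in arm_compute/core/Utils.h):

#include <iostream>

int main()
{
    const int dim_y   = 27; // input->dimension(1): the W dimension of an NHWC tensor
    const int pad     = 2;  // conv_info.pad_right()
    const int step_y  = 12; // num_elems_read_per_iteration_y
    const int rounded = ((dim_y + pad + step_y - 1) / step_y) * step_y; // ceil_to_multiple(29, 12) == 36
    std::cout << "border_size.bottom = " << (rounded - dim_y) << '\n';  // 36 - 27 == 9 extra rows
    return 0;
}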
@@ -1268,7 +1577,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
}
break;
}
-
default:
{
ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
@@ -1278,11 +1586,25 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
}
else
{
+ const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int stride_x = std::get<0>(_conv_info.stride());
+ const int stride_y = std::get<1>(_conv_info.stride());
+
switch(_input->info()->data_type())
{
case DataType::F32:
- convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+ {
+ if(kernel_size == 9 && stride_x == 1 && stride_y == 1)
+ {
+ using vtype = wrapper::traits::neon_vector<float, 4>;
+ convolve_9x9_nhwc<vtype>(window, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+ }
break;
+ }
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
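Finally, a hedged usage sketch of how the new path is reached from the public runtime API, assuming the NEDirectConvolutionLayer function with its configure(input, weights, bias, output, conv_info) signature and illustrative shapes: an F32 NHWC input with a 9x9 kernel and unit strides should select convolve_9x9_nhwc in the dispatch above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes only (TensorShape order for NHWC is C, W, H[, N]).
    TensorInfo src_info(TensorShape(64U, 56U, 56U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(64U, 9U, 9U, 32U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(32U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 48U, 48U), 1, DataType::F32); // 56 - 9 + 1 = 48
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    Tensor src, weights, bias, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    bias.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);

    NEDirectConvolutionLayer conv;
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0)); // 9x9 kernel, stride 1x1

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run(); // dispatches to the 9x9 NHWC NEON kernel for F32 with these parameters
    return 0;
}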