From 95f9361416a0db7fee2e35bd0f7afbecbb7352c2 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 13 Jun 2019 15:58:32 +0100
Subject: COMPMID-2185: Implement direct convolution 9x9 (NHWC) on NEON

Change-Id: I3f47947bdbc80946668165c8fc2ce52eca7722a6
Signed-off-by: giuros01
Reviewed-on: https://review.mlplatform.org/c/1409
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 .../kernels/NEDirectConvolutionLayerKernel.cpp | 344 ++++++++++++++++++++-
 1 file changed, 333 insertions(+), 11 deletions(-)

diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index d557cfa1bd..bcf70b3ad8 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -865,6 +865,256 @@ public:
     }
 };
 
+inline void convolve_row1x9_nhwc(const float *row_ptr, const float *weights_ptr, size_t src_stride_y, size_t weights_stride_y,
+                                 float32x4_t &acc0, float32x4_t &acc1, float32x4_t &acc2, float32x4_t &acc3)
+{
+    // Load 4 channels for each of the 12 input values along the same X spatial dimension
+    const float32x4_t src0  = wrapper::vloadq(row_ptr);
+    const float32x4_t src1  = wrapper::vloadq(row_ptr + 1 * src_stride_y);
+    const float32x4_t src2  = wrapper::vloadq(row_ptr + 2 * src_stride_y);
+    const float32x4_t src3  = wrapper::vloadq(row_ptr + 3 * src_stride_y);
+    const float32x4_t src4  = wrapper::vloadq(row_ptr + 4 * src_stride_y);
+    const float32x4_t src5  = wrapper::vloadq(row_ptr + 5 * src_stride_y);
+    const float32x4_t src6  = wrapper::vloadq(row_ptr + 6 * src_stride_y);
+    const float32x4_t src7  = wrapper::vloadq(row_ptr + 7 * src_stride_y);
+    const float32x4_t src8  = wrapper::vloadq(row_ptr + 8 * src_stride_y);
+    const float32x4_t src9  = wrapper::vloadq(row_ptr + 9 * src_stride_y);
+    const float32x4_t src10 = wrapper::vloadq(row_ptr + 10 * src_stride_y);
+    const float32x4_t src11 = wrapper::vloadq(row_ptr + 11 * src_stride_y);
+
+    // Load 4 channels for each of the 9 weight values along the same X spatial dimension
+    const float32x4_t w0 = wrapper::vloadq(weights_ptr);
+    const float32x4_t w1 = wrapper::vloadq(weights_ptr + 1 * weights_stride_y);
+    const float32x4_t w2 = wrapper::vloadq(weights_ptr + 2 * weights_stride_y);
+    const float32x4_t w3 = wrapper::vloadq(weights_ptr + 3 * weights_stride_y);
+    const float32x4_t w4 = wrapper::vloadq(weights_ptr + 4 * weights_stride_y);
+    const float32x4_t w5 = wrapper::vloadq(weights_ptr + 5 * weights_stride_y);
+    const float32x4_t w6 = wrapper::vloadq(weights_ptr + 6 * weights_stride_y);
+    const float32x4_t w7 = wrapper::vloadq(weights_ptr + 7 * weights_stride_y);
+    const float32x4_t w8 = wrapper::vloadq(weights_ptr + 8 * weights_stride_y);
+
+    // Compute 4 channels for each of the 4 output values along the same X spatial dimension
+    acc0 = wrapper::vmla(acc0, w0, src0);
+    acc0 = wrapper::vmla(acc0, w1, src1);
+    acc0 = wrapper::vmla(acc0, w2, src2);
+    acc0 = wrapper::vmla(acc0, w3, src3);
+    acc0 = wrapper::vmla(acc0, w4, src4);
+    acc0 = wrapper::vmla(acc0, w5, src5);
+    acc0 = wrapper::vmla(acc0, w6, src6);
+    acc0 = wrapper::vmla(acc0, w7, src7);
+    acc0 = wrapper::vmla(acc0, w8, src8);
+
+    acc1 = wrapper::vmla(acc1, w0, src1);
+    acc1 = wrapper::vmla(acc1, w1, src2);
+    acc1 = wrapper::vmla(acc1, w2, src3);
+    acc1 = wrapper::vmla(acc1, w3, src4);
+    acc1 = wrapper::vmla(acc1, w4, src5);
+    acc1 = wrapper::vmla(acc1, w5, src6);
+    acc1 = wrapper::vmla(acc1, w6, src7);
+    acc1 = wrapper::vmla(acc1, w7, src8);
+    acc1 = wrapper::vmla(acc1, w8, src9);
+
+    acc2 = wrapper::vmla(acc2, w0, src2);
+    acc2 = wrapper::vmla(acc2, w1, src3);
+    acc2 = wrapper::vmla(acc2, w2, src4);
+    acc2 = wrapper::vmla(acc2, w3, src5);
+    acc2 = wrapper::vmla(acc2, w4, src6);
+    acc2 = wrapper::vmla(acc2, w5, src7);
+    acc2 = wrapper::vmla(acc2, w6, src8);
+    acc2 = wrapper::vmla(acc2, w7, src9);
+    acc2 = wrapper::vmla(acc2, w8, src10);
+
+    acc3 = wrapper::vmla(acc3, w0, src3);
+    acc3 = wrapper::vmla(acc3, w1, src4);
+    acc3 = wrapper::vmla(acc3, w2, src5);
+    acc3 = wrapper::vmla(acc3, w3, src6);
+    acc3 = wrapper::vmla(acc3, w4, src7);
+    acc3 = wrapper::vmla(acc3, w5, src8);
+    acc3 = wrapper::vmla(acc3, w6, src9);
+    acc3 = wrapper::vmla(acc3, w7, src10);
+    acc3 = wrapper::vmla(acc3, w8, src11);
+}
+
+float vreduce(const float32x4_t &v)
+{
+    auto v0    = wrapper::vgethigh(v);
+    auto v1    = wrapper::vgetlow(v);
+    auto v_out = wrapper::vadd(v0, v1);
+
+    float a = wrapper::vgetlane(v_out, 0);
+    float b = wrapper::vgetlane(v_out, 1);
+    return a + b;
+}
+
+template <typename V>
+class convolver_9x9_nhwc
+{
+public:
+    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        // Declare useful types
+        using vector_type = typename V::type;
+        using scalar_type = typename V::scalar_type;
+        using tag_type    = typename V::tag_type;
+
+        // Scalar quantities
+        const int element_size    = input->info()->element_size();
+        const int input_width     = input->info()->dimension(0);
+        const int input_depth     = input->info()->dimension(2);
+        const int input_stride_y  = input->info()->strides_in_bytes().y() / element_size;
+        const int input_stride_z  = input->info()->strides_in_bytes().z() / element_size;
+        const int input_stride_w  = input->info()->strides_in_bytes()[3];
+        const int output_stride_x = output->info()->strides_in_bytes().x();
+        const int output_stride_y = output->info()->strides_in_bytes().y();
+        const int kernel_stride_y = weights->info()->strides_in_bytes().y() / element_size;
+        const int kernel_stride_z = weights->info()->strides_in_bytes().z() / element_size;
+        const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_top  = conv_info.pad_top();
+        const unsigned int conv_pad_left = conv_info.pad_left();
+
+        // Setup input window for the input iterator
+        Window window_in = window;
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        // Setup window for the output iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        // Setup window for the weights iterator
+        Window window_k = calculate_max_window(*weights->info(), Steps());
+        window_k.set(Window::DimX, Window::Dimension(0, 1, 1));
+        window_k.set(Window::DimY, Window::Dimension(0, 1, 1));
+        window_k.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1));
+
+        Iterator in(input, window_in);
+        Iterator out(output, window_out);
+        Iterator k(weights, window_k);
+
+        // Calculate the max_offset.
+        // max_offset is the offset of the last non-valid value in the Z dimension (spatial dimension Y for NHWC)
+        //  |******************|
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane0      |
+        //  |      batch0      |
+        //  |__________________|
+        //  |******************|       Batch 0
+        //  |    pad_bottom    |
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane1      |
+        //  |      batch0      |
+        //  |__________________|-----> max_offset
+        //  |******************|
+        //  |    pad_bottom    |
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane0      |
+        //  |      batch1      |
+        //  |__________________|
+        //  |******************|       Batch 1
+        //  |    pad_bottom    |
+        //  |      pad_top     |
+        //  |******************|
+        //  |                  |
+        //  |      plane1      |
+        //  |      batch1      |
+        //  |__________________|
+        //  |    pad_bottom    |
+        //  |******************|
+        const int max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y;
+        execute_window_loop(window_k, [&](const Coordinates & id_k) // loop over the kernels (output channels)
+        {
+
+            execute_window_loop(window_out, [&](const Coordinates & id)
+            {
+                const auto y_offset = int(id.y() - conv_pad_left) * input_stride_y;
+
+                // Buffer pointers
+                const scalar_type *in_ptr      = reinterpret_cast<const scalar_type *>(input->buffer() + input->info()->offset_first_element_in_bytes() + id[3] * input_stride_w);
+                const scalar_type *weights_ptr = reinterpret_cast<const scalar_type *>(k.ptr());
+                uint8_t           *out_ptr     = out.ptr() + id_k[3] * output_stride_x;
+
+                // Output elements
+                vector_type out0 = wrapper::vdup_n(scalar_type(0), tag_type());
+                vector_type out1 = wrapper::vdup_n(scalar_type(0), tag_type());
+                vector_type out2 = wrapper::vdup_n(scalar_type(0), tag_type());
+                vector_type out3 = wrapper::vdup_n(scalar_type(0), tag_type());
+
+                // Reduce along the feature maps
+                for(int x = 0; x < input_width; x += num_elems_read_per_iteration)
+                {
+                    // z == 0
+                    auto in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top);
+                    in_z        = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+                    auto offset = y_offset + in_z * input_stride_z;
+                    offset      = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 0 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 1
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 1);
+                    in_z   = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+                    offset = y_offset + in_z * input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 1 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 2
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 2);
+                    in_z   = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
+                    offset = y_offset + in_z * input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 2 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 3
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 3);
+                    offset = y_offset + in_z * input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 3 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 4
+                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 4);
+                    offset = y_offset + in_z * input_stride_z;
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 4 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 5
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 5 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 6
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 6 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 7
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 7 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+
+                    // z == 8
+                    offset += input_stride_z;
+                    offset = std::min(offset, max_offset);
+                    convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 8 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
+                }
+
+                *(reinterpret_cast<scalar_type *>(out_ptr + 0 * output_stride_y)) = vreduce(out0);
+                *(reinterpret_cast<scalar_type *>(out_ptr + 1 * output_stride_y)) = vreduce(out1);
+                *(reinterpret_cast<scalar_type *>(out_ptr + 2 * output_stride_y)) = vreduce(out2);
+                *(reinterpret_cast<scalar_type *>(out_ptr + 3 * output_stride_y)) = vreduce(out3);
+            },
+            in, out);
+        },
+        k);
+    }
+};
+
 template <typename T1, typename T2>
 inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                          const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -969,6 +1219,21 @@ inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_i
     }
 }
 
+template <typename V>
+inline void convolve_9x9_nhwc(const Window &window, unsigned int num_elems_read_per_iteration,
+                              const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    switch(conv_stride_x)
+    {
+        case 1:
+            convolver_9x9_nhwc<V>::convolve(window, num_elems_read_per_iteration, input, weights, output, conv_info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
@@ -1122,18 +1387,62 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     }
     else
     {
-        border_size.left   = 0;
-        border_size.top    = conv_info.pad_left();
-        border_size.right  = 0;
-        border_size.bottom = conv_info.pad_right();
+        if(kernel_size == 9)
+        {
+            border_size.left = 0;
+            border_size.top  = conv_info.pad_left();
+
+            const int num_elems_read_per_iteration_x    = 4;
+            const int num_elems_written_per_iteration_x = 1;
+            const int num_elems_read_per_iteration_y    = 12;
+            const int num_elems_written_per_iteration_y = 4;
+
+            num_elems_read_per_iteration    = num_elems_read_per_iteration_x;
+            num_elems_written_per_iteration = num_elems_written_per_iteration_x;
+
+            border_size.right = num_elems_read_per_iteration_x;
+            if((conv_info.pad_bottom() != 0) || (conv_info.pad_top() != 0))
+            {
+                // If bottom or top padding is set, we need to be able to read num_elems_read_per_iteration_y zeroed rows.
+                // Since num_elems_read_per_iteration_y is always greater than conv_info.pad_right() we can set
+                // the bottom padding to num_elems_read_per_iteration_y
+                border_size.bottom = num_elems_read_per_iteration_y;
+            }
+            else if(conv_info.pad_right() != 0)
+            {
+                // Conventional border padding. Fill the bottom padding so that we can read in batches of num_elems_read_per_iteration_y
+                border_size.bottom = ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y) - input->dimension(1);
+            }
+            else
+            {
+                // No padding
+                border_size.bottom = 0;
+            }
 
-        num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
+            win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
 
-        win = calculate_max_window(*output, Steps());
+            AccessWindowStatic input_access(input, 0, -border_size.top,
+                                            ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x),
+                                            input->dimension(1) + border_size.bottom);
 
-        AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
-        AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
-        window_changed = update_window_and_padding(win, input_access, weights_access);
+            AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_read_per_iteration_x), weights->dimension(1));
+            AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+            window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        }
+        else
+        {
+            border_size.left   = 0;
+            border_size.top    = conv_info.pad_left();
+            border_size.right  = 0;
+            border_size.bottom = conv_info.pad_right();
+            num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
+            win = calculate_max_window(*output, Steps());
+
+            AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
+            AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
+            window_changed = update_window_and_padding(win, input_access, weights_access);
+        }
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -1268,7 +1577,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
             }
             break;
         }
-
         default:
         {
             ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
@@ -1278,11 +1586,25 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
     }
     else
    {
+        const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
+        const int stride_x    = std::get<0>(_conv_info.stride());
+        const int stride_y    = std::get<1>(_conv_info.stride());
+
         switch(_input->info()->data_type())
         {
             case DataType::F32:
-                convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+            {
+                if(kernel_size == 9 && stride_x == 1 && stride_y == 1)
+                {
+                    using vtype = wrapper::traits::neon_vector<float, 4>;
+                    convolve_9x9_nhwc<vtype>(window, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                }
+                else
+                {
+                    convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                }
                 break;
+            }
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
-- 
cgit v1.2.1
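
Note (not part of the patch above): the per-row accumulation done by convolve_row1x9_nhwc and the final cross-lane reduction done by vreduce can be written directly with NEON intrinsics instead of the ACL wrapper:: helpers. The sketch below is illustrative only; it assumes an AArch64/NEON toolchain, mirrors a single accumulator (the acc0 path) for one output position, and the helper names mac_row1x9 and reduce4 are invented for this example.

    // Illustrative sketch (assumption: AArch64/NEON target); not the library's implementation.
    #include <arm_neon.h>
    #include <cstddef>

    // Accumulate one 1x9 kernel row into one output position, 4 channels at a time.
    // Corresponds to the acc0 updates in convolve_row1x9_nhwc.
    static inline float32x4_t mac_row1x9(const float *row_ptr, const float *weights_ptr,
                                         size_t src_stride_y, size_t weights_stride_y,
                                         float32x4_t acc)
    {
        for(int i = 0; i < 9; ++i)
        {
            const float32x4_t src = vld1q_f32(row_ptr + i * src_stride_y);         // 4 channels of input value x + i
            const float32x4_t w   = vld1q_f32(weights_ptr + i * weights_stride_y); // 4 channels of weight value i
            acc                   = vmlaq_f32(acc, w, src);                        // per-lane acc += w * src
        }
        return acc;
    }

    // Sum the 4 lanes of the accumulator into one scalar, as vreduce() does
    // with vgethigh/vgetlow/vadd/vgetlane.
    static inline float reduce4(float32x4_t v)
    {
        const float32x2_t sum2 = vadd_f32(vget_high_f32(v), vget_low_f32(v)); // {v0 + v2, v1 + v3}
        return vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
    }

The kernel itself keeps four such accumulators (out0..out3) live so that the twelve input values loaded per call are reused across four neighbouring output positions before each per-channel sum is reduced to a scalar and stored.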