From 4e2bbbbb23e6f4bd452f7f865e51228e1f51efec Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Mon, 9 Jan 2023 17:21:01 +0000 Subject: Add support for dilation > 1 in assembly DepthwiseConvolution * Resolve COMPMID-5689 Change-Id: I81a3791ad054db59562b76d1c729f2b2168aee8b Signed-off-by: Pablo Marquez Tello Signed-off-by: Andrew Mundy Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8919 Reviewed-by: Jakub Sujak Reviewed-by: Viet-Hoa Do Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../arm_conv/depthwise/depthwise_depthfirst.hpp | 59 ++++++++++++---------- 1 file changed, 31 insertions(+), 28 deletions(-) (limited to 'src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp') diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp index 70b12919b0..2620b48e17 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -367,6 +367,7 @@ class DepthwiseDepthfirst protected: void compute_tile_padded( + const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, unsigned int output_channel_start, unsigned int output_channel_end, const TensorSpec &input, @@ -379,13 +380,13 @@ class DepthwiseDepthfirst auto ws = reinterpret_cast(working_space_raw); // Compute the input pointer array - const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + const auto input_channel_start = output_channel_start / args.channel_multiplier; - const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const int ii = static_cast(output_i * args.stride_rows) - args.padding.top; const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); const auto input_i = static_cast(ii < 0 ? 0 : ii); - const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const int ij = static_cast(output_j * args.stride_cols) - args.padding.left; const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); const auto input_j = static_cast(ij < 0 ? 0 : ij); @@ -394,8 +395,8 @@ class DepthwiseDepthfirst input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, input.ld_row, input.ld_col, ws->input_buffer, - input_pad_top, this->m_args.input_rows - input_i, - input_pad_left, this->m_args.input_cols - input_j + input_pad_top, args.input_rows - input_i, + input_pad_left, args.input_cols - input_j ); // Compute the output pointer array @@ -404,8 +405,8 @@ class DepthwiseDepthfirst output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, output.ld_row, output.ld_col, ws->output_buffer, - 0, this->m_args.output_rows - output_i, // Top padding, # valid rows - 0, this->m_args.output_cols - output_j // Left padding, # valid columns + 0, args.output_rows - output_i, // Top padding, # valid rows + 0, args.output_cols - output_j // Left padding, # valid columns ); // Execute the kernel @@ -416,6 +417,7 @@ class DepthwiseDepthfirst } void compute_row_padded_tile_row( + const DepthwiseArgs &args, const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, const unsigned int output_channel_start, const unsigned int output_channel_end, const TensorSpec &input, @@ -430,19 +432,19 @@ class DepthwiseDepthfirst const auto os = this->get_output_stage(); // Compute top and bottom padding; hence fill in the initial pointer arrays. - const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; - const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const auto input_channel_start = output_channel_start / args.channel_multiplier; + const int ii = static_cast(output_i * args.stride_rows) - args.padding.top; const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); const auto input_i = static_cast(ii < 0 ? 0 : ii); - const auto input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left; + const auto input_j = output_j * args.stride_cols - args.padding.left; // Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows // available. - const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, this->m_args.input_rows - input_i); - const auto valid_output_rows = std::min(strat->get_output_rows(), this->m_args.output_rows - output_i); + const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i); + const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i); - const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols; + const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols; const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols(); fill_pointer_array( @@ -450,8 +452,8 @@ class DepthwiseDepthfirst input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, input.ld_row, input.ld_col, ws->input_buffer, - input_pad_top, this->m_args.input_rows - input_i, - 0, this->m_args.input_cols - input_j // No left padding + input_pad_top, args.input_rows - input_i, + 0, args.input_cols - input_j // No left padding ); fill_pointer_array( @@ -459,8 +461,8 @@ class DepthwiseDepthfirst output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, output.ld_row, output.ld_col, ws->output_buffer, - 0, this->m_args.output_rows - output_i, // Top padding, # valid rows - 0, this->m_args.output_cols - output_j // Left padding, # valid columns + 0, args.output_rows - output_i, // Top padding, # valid rows + 0, args.output_cols - output_j // Left padding, # valid columns ); for (; n_tile_cols; n_tile_cols--) @@ -492,6 +494,7 @@ class DepthwiseDepthfirst } void compute_tiles_unpadded( + const DepthwiseArgs &args, unsigned int output_i, const unsigned int output_j, unsigned int n_tile_rows, unsigned int n_tile_cols, unsigned int output_channel_start, unsigned int output_channel_end, @@ -511,8 +514,8 @@ class DepthwiseDepthfirst // If the direct kernel is supported, then use it. // Compute the base pointers we'll use in the tile. auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col; - const int start_input_i = output_i * this->m_args.stride_rows - this->m_args.padding.top; - const int start_input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left; + const int start_input_i = output_i * args.stride_rows - args.padding.top; + const int start_input_j = output_j * args.stride_cols - args.padding.left; auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col; // Execute the kernel @@ -528,10 +531,10 @@ class DepthwiseDepthfirst { // Otherwise, we repeatedly call the padded kernel but use our knowledge // of the tensor structure to avoid recomputing the pointer array. - const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + const auto input_channel_start = output_channel_start / args.channel_multiplier; const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols(); - const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols; + const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols; const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols(); @@ -539,16 +542,16 @@ class DepthwiseDepthfirst // each subsequent tile we simply update the pointers. for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++) { - const int input_i = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; - const int input_j = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const int input_i = static_cast(output_i * args.stride_rows) - args.padding.top; + const int input_j = static_cast(output_j * args.stride_cols) - args.padding.left; fill_pointer_array( ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, input.ld_row, input.ld_col, ws->input_buffer, - 0, this->m_args.input_rows, - 0, this->m_args.input_cols + 0, args.input_rows, + 0, args.input_cols ); // Compute the output pointer array @@ -557,8 +560,8 @@ class DepthwiseDepthfirst output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, output.ld_row, output.ld_col, ws->output_buffer, - 0, this->m_args.output_rows, - 0, this->m_args.output_cols + 0, args.output_rows, + 0, args.output_cols ); for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++) -- cgit v1.2.1