From be0ae93c50bfa3e588111585025278daa8cb0694 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 13 Mar 2018 13:08:12 +0000
Subject: COMPMID-1005: Update Depthwise Convolution from RSH

Change-Id: I3033ddb8de183661010d6c71a83f71132037b139
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124338
Tested-by: Jenkins
Reviewed-by: Pablo Tello
---
 .../kernels/NEDepthwiseConvolutionLayer3x3Kernel.h |  20 +-
 .../NEON/kernels/convolution/common/profiler.hpp   |  31 ++-
 .../kernels/convolution/common/tensor_utils.hpp    |   9 +-
 .../core/NEON/kernels/convolution/common/utils.hpp |   1 -
 .../kernels/convolution/depthwise/depthwise.hpp    | 176 +++++++++---
 .../kernels/convolution/depthwise/impl_base.hpp    | 297 ++++++++++++++-------
 .../convolution/depthwise/impl_fp32_fp32.hpp       |  69 +++--
 7 files changed, 430 insertions(+), 173 deletions(-)

(limited to 'arm_compute')

diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 1367f378f7..5871cc5dcb 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -82,8 +82,24 @@ private:
     void configure_optimized();
     void run_generic(const Window &window, const ThreadInfo &info);
     void run_optimized(const Window &window, const ThreadInfo &info);
-    std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver_object(TensorShape shape, PadStrideInfo conv_info,
-                                                                              const uint8_t *w_ptr, uint8_t *in_ptr, uint8_t *out_ptr);
+    /** Creates an optimized backend convolver object
+     *
+     * @note Only convolvers with strides of 1 or 2 and a kernel size of 3 are currently supported.
+     *
+     * @param[in] conv_info     Padding and stride information to use for the convolution
+     * @param[in] w             Weights tensor
+     * @param[in] in            Input tensor
+     * @param[in] out           Output tensor
+     * @param[in] setup_strides (Optional) Boolean to enable setting the strides of the tensors
+     *                          in the convolver in case of padding. Defaults to false.
+     *
+     * @return A convolver object or nullptr if the configuration is not supported
+     */
+    std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver_object(PadStrideInfo  conv_info,
+                                                                              const ITensor *w,
+                                                                              const ITensor *in,
+                                                                              ITensor       *out,
+                                                                              bool           setup_strides = false);
 
 private:
     BorderSize _border_size;
diff --git a/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp b/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp
index 01fafa9604..c6897e3771 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp
@@ -24,17 +24,21 @@
 
 #pragma once
 
+#include
+#include
+#include
+#include
+
+#ifdef CYCLE_PROFILING
 #include
 #include
-#include
-#include
 #include
 #include
 #include
 #include
 #include "perf.h"
-#include
+#endif // CYCLE_PROFILING
 
 #ifdef CYCLE_PROFILING
 class EventIDContainer
@@ -295,32 +299,43 @@ public:
 #endif // CYCLE_PROFILING
 
   template <typename T>
-  void operator() (const char * event,
-                   T func,
-                   long int bytes_read = 0,
-                   long int ops = 0,
-                   long int bytes_written = 0) {
+  double operator() (const char * event,
+                     T func,
+                     long int bytes_read = 0,
+                     long int ops = 0,
+                     long int bytes_written = 0) {
 #ifdef CYCLE_PROFILING
     if (currentevent==maxevents) {
+      const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
       func();
+      const std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+      return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
     } else {
       const auto countfd = thread_counter_fds.get_counter_fd();
+      const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
       start_counter(countfd);
       func();
      long long cycs = stop_counter(countfd);
+      const std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+      const auto duration_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
 
      // Store the profiling data
      std::lock_guard<std::mutex> lock_events(event_lock);
      events[currentevent++] = { get_event_id(event), bytes_read, ops, bytes_written, cycs };
+
+      return duration_us;
    }
#else
    (void) event;
    (void) bytes_read;
    (void) ops;
    (void) bytes_written;
+    const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
    func();
+    const std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+    return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
#endif // CYCLE_PROFILING
  }
};
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
index 68a5c6a178..0c234431b1 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
@@ -30,9 +30,12 @@ void PrintTensor(const Tensor4D& tensor);
 void PrintWeights(const Tensor4D& weights);
 
 // Test the equivalence of two tensors
-bool CmpTensors(const Tensor4D& a,
-                const Tensor4D& b,
-                const float max_delta=0.0f);
+// Counts the instances that |a - b|/|a| > max_err
+bool CmpTensors(
+  const Tensor4D& a,
+  const Tensor4D& b,
+  const float max_err=0.0f
+);
 
 // Fill the tensor with a test pattern
 void TestPattern(Tensor4D& tensor);
diff --git a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
index a22809fb58..5f42719119 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
+++ 
b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp @@ -24,7 +24,6 @@ #pragma once -double TimeInUs(void); void PrintMatrix(const float *const m, const int M, const int N, const int row_stride); inline int iceildiv(const int a, const int b) diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp index 80b0614015..4ca68116db 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp @@ -29,7 +29,7 @@ namespace depthwise class IDepthwiseConvolution { -public: + public: virtual ~IDepthwiseConvolution() = default; virtual int output_size(const int dim_size, const bool padding_same) const = 0; virtual unsigned int get_window(void) const = 0; @@ -59,8 +59,8 @@ class DepthwiseConvolution : public IDepthwiseConvolution static constexpr int kernel_cols = KernelCols; static constexpr int stride_rows = StrideRows; static constexpr int stride_cols = StrideCols; - static constexpr int inner_tile_rows = stride_rows * output_tile_rows + kernel_rows - 1; - static constexpr int inner_tile_cols = stride_cols * output_tile_cols + kernel_cols - 1; + static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows; + static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols; /** Create a new depthwise convolution engine. * @@ -79,6 +79,75 @@ class DepthwiseConvolution : public IDepthwiseConvolution const TIn* const weights, const TIn* const input, TOut* const output + ) : DepthwiseConvolution( + n_batches, n_input_rows, n_input_cols, n_channels, padding_same, + weights, input, output, 0 /* column stride = default */ + ) + { + } + + /** Create a new depthwise convolution engine with a specified column stride. + * + * @param[in] n_batches Number of batches tensors. + * @param[in] n_input_rows Number of rows in input tensor. + * @param[in] n_input_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input and output tensors. + * @param[in] padding_same True if padding is SAME, else VALID. + * @param[in] weights Pointer to Height x Width x Channel ordered weights. + * @param[in] input Pointer to NHWC ordered input tensor. + * @param[output] output Pointer to NHWC ordered output tensor. + * @param[in] col_stride Stride between columns of the weights, inputs and output tensors. + */ + DepthwiseConvolution( + const int n_batches, const int n_input_rows, const int n_input_cols, + const int n_channels, const bool padding_same, + const TIn* const weights, + const TIn* const input, + TOut* const output, + const int col_stride + ) : DepthwiseConvolution( + n_batches, n_input_rows, n_input_cols, n_channels, padding_same, + weights, input, output, + col_stride, 0, /* Weight row stride = default */ + col_stride, 0, 0, /* Input row stride, batch stride = default */ + col_stride, 0, 0 /* Output row stride, batch stride = default */ + ) + { + } + + /** Create a new depthwise convolution engine. + * + * @param[in] n_batches Number of batches tensors. + * @param[in] n_input_rows Number of rows in input tensor. + * @param[in] n_input_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input and output tensors. + * @param[in] padding_same True if padding is SAME, else VALID. + * @param[in] weights Pointer to Height x Width x Channel ordered weights. + * @param[in] input Pointer to NHWC ordered input tensor. 
+ * @param[output] output Pointer to NHWC ordered output tensor. + * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately). + * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately). + * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately). + * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately). + * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately). + * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately). + * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately). + * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately). + */ + DepthwiseConvolution( + const int n_batches, const int n_input_rows, const int n_input_cols, + const int n_channels, const bool padding_same, + const TIn* const weights, + const TIn* const input, + TOut* const output, + const int weight_col_stride, + const int weight_row_stride, + const int input_col_stride, + const int input_row_stride, + const int input_batch_stride, + const int output_col_stride, + const int output_row_stride, + const int output_batch_stride ); // Cannot copy or move a DepthwiseConvolution. @@ -99,14 +168,15 @@ class DepthwiseConvolution : public IDepthwiseConvolution */ int output_size(const int dim_size, const bool padding_same) const override { - return DepthwiseConvolution::get_output_size(dim_size, padding_same); + return DepthwiseConvolution< + OutputTileRows, + OutputTileCols, + KernelRows, + KernelCols, + StrideRows, + StrideCols, + TIn, TOut + >::get_output_size(dim_size, padding_same); } /** Get the window of work to be performed by an instance of the operator. @@ -128,6 +198,8 @@ class DepthwiseConvolution : public IDepthwiseConvolution static void process_tile_row( const int n_channels, const TIn* const weights, + const int weight_row_stride, + const int weight_col_stride, const TIn* const inptr, const int in_row_stride, const int in_col_stride, @@ -143,7 +215,27 @@ class DepthwiseConvolution : public IDepthwiseConvolution const int n_output_cols ); - /** Process a single tile of the tensors. + // Determine the maximum (and minimum) padding values which can be applied + // to tiles of the tensors involved in this class of convolution. + static constexpr int max_in_pad_top = (kernel_rows - 1) / 2; + static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2; + + static constexpr int max_in_pad_left = (kernel_cols - 1) / 2; + static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2; + + static constexpr int max_in_pad_bottom = inner_tile_rows; + static constexpr int max_in_pad_right = inner_tile_cols; + static constexpr int max_out_pad_bottom = output_tile_rows; + static constexpr int max_out_pad_right = output_tile_cols; + + static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1; + static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1; + static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1; + static constexpr int n_in_pad_right_fns = max_in_pad_right + 1; + static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1; + static constexpr int n_out_pad_right_fns = max_out_pad_right + 1; + + /** Pointer to a function which will process a tile. 
* * @param[in] n_channels Number of channels. * @param[in] weights Pointer to Height x Width x Channels ordered weights. @@ -153,48 +245,47 @@ class DepthwiseConvolution : public IDepthwiseConvolution * @param[out] outptr Pointer to the top-left output value for the tile. * @param[in] out_row_stride Stride between rows of the output tensor. * @param[in] out_col_stride Stride between columns of the output tensor. + * + * The following parameters may be ignored if the function has been + * specialised for specific padding constraints. + * + * @param[in] _in_pad_top Padding to apply to top of input tile. + * @param[in] _in_pad_left Padding to apply to left of input tile. + * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. + * @param[in] _in_pad_right Padding to apply to right of input tile. + * @param[in] _out_pad_bottom Null cells at bottom of output tile. + * @param[in] _out_pad_right Null cells at right of output tile. */ - template < - int in_pad_top, int in_pad_left, int in_pad_bottom, int in_pad_right, - int out_pad_bottom, int out_pad_right - > - static void process_tile( + typedef void (*TileFn)( const int n_channels, const TIn* const weights, + const int weight_row_stride, + const int weight_col_stride, const TIn* const inptr, const int in_row_stride, const int in_col_stride, TOut* const outptr, const int out_row_stride, - const int out_col_stride - ); - - // Type of a pointer to a `process_tile` instance - typedef void (*TileFn)( - const int, - const TIn* const, - const TIn* const, const int, const int, - TOut* const, const int, const int + const int out_col_stride, + const int _in_pad_top, + const int _in_pad_left, + const int _in_pad_bottom, + const int _in_pad_right, + const int _out_pad_bottom, + const int _out_pad_right ); - // Determine the maximum padding values which can be applied to tiles of - // the tensors involved in this class of convolution. - static constexpr int max_in_pad_top = 2; - static constexpr int max_in_pad_left = 2; - static constexpr int max_in_pad_bottom = inner_tile_rows - 1; - static constexpr int max_in_pad_right = inner_tile_cols - 1; - static constexpr int max_out_pad_bottom = output_tile_rows; - static constexpr int max_out_pad_right = output_tile_cols; - - /** Array of methods to process tensor tiles. + /* Arrays of methods to process tensor tiles. * * Allows dynamic dispatch to specialized implementations based on * different padding configurations. */ - static const TileFn tile_fns[ - max_in_pad_top][max_in_pad_left][max_in_pad_bottom][max_in_pad_right][ - max_out_pad_bottom][max_out_pad_right - ]; + static const TileFn tilefn_unpadded; + static const TileFn tilefn_top[n_in_pad_top_fns]; + static const TileFn tilefn_left[n_in_pad_left_fns]; + static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns]; + static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns]; + static const TileFn tilefn_generic; private: // Member variables of instances of a convolution engine. 
@@ -204,6 +295,11 @@ class DepthwiseConvolution : public IDepthwiseConvolution const int _n_batches, _n_input_rows, _n_input_cols, _n_channels, _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols; const bool _padding_same; + + // Stride information for a convolution instance + const int _weight_col_stride, _weight_row_stride; + const int _input_col_stride, _input_row_stride, _input_batch_stride; + const int _output_col_stride, _output_row_stride, _output_batch_stride; }; } // namespace depthwise diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp index f9671fc426..17889849db 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp @@ -39,6 +39,8 @@ namespace depthwise { +const unsigned int CHANNEL_BLOCK = 16; + template int DepthwiseConvolution::get_output_size( const int dim_size, const bool same_padding @@ -54,7 +56,15 @@ DepthwiseConvolution::DepthwiseConvolution( const int n_channels, const bool padding_same, const TIn* const weights, const TIn* const input, - TOut* const output + TOut* const output, + const int weight_col_stride, + const int weight_row_stride, + const int input_col_stride, + const int input_row_stride, + const int input_batch_stride, + const int output_col_stride, + const int output_row_stride, + const int output_batch_stride ) : _weights(weights), _input(input), _output(output), _n_batches(n_batches), _n_input_rows(n_input_rows), @@ -64,7 +74,15 @@ DepthwiseConvolution::DepthwiseConvolution( _n_output_cols(get_output_size(n_input_cols, padding_same)), _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)), _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)), - _padding_same(padding_same) + _padding_same(padding_same), + _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels), + _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride), + _input_col_stride(input_col_stride ? input_col_stride : _n_channels), + _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride), + _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride), + _output_col_stride(output_col_stride ? output_col_stride : _n_channels), + _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride), + _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride) { } @@ -72,8 +90,8 @@ DepthwiseConvolution::DepthwiseConvolution( template unsigned int DepthwiseConvolution::get_window() const { - // TODO Later support parallelisation over tile rows. - return 1; // _n_tile_rows; + // Parallelise over blocks of channels. + return iceildiv(_n_channels, CHANNEL_BLOCK); } @@ -83,41 +101,31 @@ void DepthwiseConvolution::run( const unsigned int stop ) { - // TODO Later support parallelisation over tile rows. 
- (void) start; - (void) stop; - - // Compute input striding - const int input_col_stride = _n_channels; - const int input_row_stride = _n_input_cols * input_col_stride; - const int input_batch_stride = _n_input_rows * input_row_stride; - - // Compute output striding - const int output_col_stride = _n_channels; - const int output_row_stride = _n_output_cols * output_col_stride; - const int output_batch_stride = _n_output_rows * output_row_stride; + // Parallelise over blocks of channels + const auto start_channel = CHANNEL_BLOCK * start; + const auto stop_channel = std::min(_n_channels, CHANNEL_BLOCK * stop); // Compute top and bottom padding for input and output const int input_pad_top = _padding_same ? - ((_n_output_rows - 1)*stride_rows + kernel_rows - _n_input_rows) / 2 : 0; + ((_n_output_rows - 1)*stride_rows + kernel_rows - _n_input_rows) / 2 : 0; const int input_pad_left = _padding_same ? - ((_n_output_cols - 1)*stride_cols + kernel_cols - _n_input_cols) / 2 : 0; - constexpr int tile_overlap = kernel_rows - 1; + ((_n_output_cols - 1)*stride_cols + kernel_cols - _n_input_cols) / 2 : 0; + constexpr int tile_overlap = kernel_rows - stride_rows; // Perform the convolution by calling `process_tile_row` for each tile row in // each batch. for (int batch = 0; batch < _n_batches; batch++) { - const TIn* const inptr_batch = _input + batch*input_batch_stride; - TOut* const outptr_batch = _output + batch*output_batch_stride; + const TIn* const inptr_batch = _input + batch*_input_batch_stride; + TOut* const outptr_batch = _output + batch*_output_batch_stride; // Loop over rows of tiles for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++) { // Pointer to the row const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top; - const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*input_row_stride); - TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * output_row_stride; + const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride); + TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride; // Input padding (top + bottom) for the row const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top; @@ -131,9 +139,10 @@ void DepthwiseConvolution::run( // Process the row process_tile_row( - _n_channels, _weights, - inptr_row, input_row_stride, input_col_stride, - outptr_row, output_row_stride, output_col_stride, + stop_channel - start_channel, + _weights + start_channel, _weight_row_stride, _weight_col_stride, + inptr_row + start_channel, _input_row_stride, _input_col_stride, + outptr_row + start_channel, _output_row_stride, _output_col_stride, input_row_pad_top, input_pad_left, input_row_pad_bottom, output_row_pad_bottom, _n_tile_cols, _n_input_cols, _n_output_cols @@ -147,6 +156,8 @@ template ::process_tile_row( const int n_channels, const TIn* const weights, + const int weight_row_stride, + const int weight_col_stride, const TIn* const inptr, const int in_row_stride, const int in_col_stride, @@ -162,7 +173,7 @@ void DepthwiseConvolution::process_tile_row const int n_output_cols ) { - constexpr int tile_overlap = kernel_cols - 1; + constexpr int tile_overlap = kernel_cols - stride_cols; // Loop over columns of tiles for (int tile_j = 0; tile_j < n_tiles; tile_j++) @@ -183,44 +194,182 @@ void DepthwiseConvolution::process_tile_row TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride; // Apply the 
specific tile processing function - tile_fns[row_pad_in_top][t_pad_in_left][row_pad_in_bottom][t_pad_in_right][row_pad_out_bottom][t_pad_out_right]( - n_channels, weights, + const bool pad_top = row_pad_in_top > 0; + const bool pad_left = t_pad_in_left > 0; + const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom; + const bool pad_right = t_pad_in_right || t_pad_out_right; + + const TileFn tilefn = [&] () { + if (!pad_top && !pad_left && !pad_bottom && !pad_right) + { + // No padding + return tilefn_unpadded; + } + else if (pad_top && !pad_left && !pad_bottom && !pad_right) + { + // Padding on the top only, subtract off the minimum expected padding in + // order to index into the array of specialised methods. + const int index = row_pad_in_top - min_in_pad_top; + return tilefn_top[index]; + } + else if (!pad_top && pad_left && !pad_bottom && !pad_right) + { + // Padding on the left only, subtract off the minimum expected padding in + // order to index into the array of specialised methods. + const int index = t_pad_in_left - min_in_pad_left; + return tilefn_left[index]; + } + else if (!pad_top && !pad_left && pad_bottom && !pad_right) + { + // Padding on the bottom only + return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom]; + } + else if (!pad_top && !pad_left && !pad_bottom && pad_right) + { + // Padding on the right only + return tilefn_right[t_pad_in_right][t_pad_out_right]; + } + else + { + // Otherwise use generic tile processing method. + return tilefn_generic; + } + }(); + + tilefn( + n_channels, + weights, weight_row_stride, weight_col_stride, inptr_col, in_row_stride, in_col_stride, - outptr_col, out_row_stride, out_col_stride + outptr_col, out_row_stride, out_col_stride, + row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, + row_pad_out_bottom, t_pad_out_right ); } } +// New templated struct used solely as a way to provide tile processing +// specialisations. +template +struct DepthwiseConvolutionImpl : public DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, TIn, TOut +> +{ + typedef DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TOut + > DWC; + + /** Perform the depthwise convolution of a tile. + * + * @param[in] n_channels Number of channels. + * @param[in] weights Pointer to Height x Width x Channels ordered weights. + * @param[in] inptr Pointer to the top-left unpadded value of the tile. + * @param[in] in_row_stride Stride between rows of the input tensor. + * @param[in] in_col_stride Stride between columns of the input tensor. + * @param[out] outptr Pointer to the top-left output value for the tile. + * @param[in] out_row_stride Stride between rows of the output tensor. + * @param[in] out_col_stride Stride between columns of the output tensor. + * + * The following parameters may be ignored if the function has been + * specialised for specific padding constraints. + * + * @param[in] _in_pad_top Padding to apply to top of input tile. + * @param[in] _in_pad_left Padding to apply to left of input tile. + * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. + * @param[in] _in_pad_right Padding to apply to right of input tile. + * @param[in] _out_pad_bottom Null cells at bottom of output tile. + * @param[in] _out_pad_right Null cells at right of output tile. 
+ */ + template < + bool Specialize=false, // Specialize (or not) the method + int InPadTop=0, // If specialized, top padding + int InPadLeft=0, // If specialized, left padding + int InPadBottom=0, // If specialized, bottom padding + int InPadRight=0, // If specialized, right padding + int OutPadBottom=0, // If specialized, bottom output padding + int OutPadRight=0 // If specialized, bottom right padding + > + static void process_tile( + const int n_channels, + const TIn* const weights, + const int weight_row_stride, + const int weight_col_stride, + const TIn* const inptr, + const int in_row_stride, + const int in_col_stride, + TOut* const outptr, + const int out_row_stride, + const int out_col_stride, + const int in_pad_top=0, + const int in_pad_left=0, + const int in_pad_bottom=0, + const int in_pad_right=0, + const int out_pad_bottom=0, + const int out_pad_right=0 + ); +}; + + template template < - int in_pad_top, int in_pad_left, int in_pad_bottom, int in_pad_right, - int out_pad_bottom, int out_pad_right + bool Specialize, + int InPadTop, int InPadLeft, int InPadBottom, int InPadRight, + int OutPadBottom, int OutPadRight > -void DepthwiseConvolution::process_tile( +void DepthwiseConvolutionImpl::process_tile( const int n_channels, - const TIn* const weights, - const TIn* const inptr, + const TIn *__restrict__ const weights, + const int weight_row_stride, + const int weight_col_stride, + const TIn *__restrict__ const inptr, const int in_row_stride, const int in_col_stride, - TOut* const outptr, + TOut *__restrict__ const outptr, const int out_row_stride, - const int out_col_stride + const int out_col_stride, + const int _in_pad_top, + const int _in_pad_left, + const int _in_pad_bottom, + const int _in_pad_right, + const int _out_pad_bottom, + const int _out_pad_right ) { + constexpr auto inner_tile_rows = DWC::inner_tile_rows; + constexpr auto inner_tile_cols = DWC::inner_tile_cols; + constexpr auto kernel_rows = DWC::kernel_rows; + constexpr auto kernel_cols = DWC::kernel_cols; + constexpr auto output_tile_rows = DWC::output_tile_rows; + constexpr auto output_tile_cols = DWC::output_tile_cols; + constexpr auto stride_rows = DWC::stride_rows; + constexpr auto stride_cols = DWC::stride_cols; + + // Extract parameters + const int in_pad_top = Specialize ? InPadTop : _in_pad_top; + const int in_pad_left = Specialize ? InPadLeft : _in_pad_left; + const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom; + const int in_pad_right = Specialize ? InPadRight : _in_pad_right; + const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom; + const int out_pad_right = Specialize ? 
OutPadRight : _out_pad_right; + // Compute valid ranges of the tile - constexpr int in_cells_i = inner_tile_rows - in_pad_bottom; - constexpr int in_cells_j = inner_tile_cols - in_pad_right; - constexpr int out_cells_i = output_tile_rows - out_pad_bottom; - constexpr int out_cells_j = output_tile_cols - out_pad_right; + const int in_cells_i = inner_tile_rows - in_pad_bottom; + const int in_cells_j = inner_tile_cols - in_pad_right; + const int out_cells_i = output_tile_rows - out_pad_bottom; + const int out_cells_j = output_tile_cols - out_pad_right; // Instantiate pointers - const TIn* inptr_base = inptr; - const TIn* wptr_base = weights; - TOut* outptr_base = outptr; - - const int weight_col_stride = n_channels; - const int weight_row_stride = kernel_cols * n_channels; + const TIn* __restrict__ inptr_base = inptr; + const TIn* __restrict__ wptr_base = weights; + TOut* __restrict__ outptr_base = outptr; // Perform the depthwise convolution int channels_remaining = n_channels; @@ -259,7 +408,7 @@ void DepthwiseConvolution::process_tile( wptr_base++; // Perform the convolution - TOut v[out_cells_i][out_cells_j]; + TOut v[output_tile_rows][output_tile_cols]; for (int out_i = 0; out_i < out_cells_i; out_i++) { for (int out_j = 0; out_j < out_cells_j; out_j++) @@ -287,7 +436,7 @@ void DepthwiseConvolution::process_tile( // Store the output tile for (int i = 0; i < out_cells_i; i++) { - TOut* const outptr_row = outptr_base + i*out_row_stride; + TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride; for (int j = 0; j < out_cells_j; j++) { *(outptr_row + j*out_col_stride) = v[i][j]; @@ -297,52 +446,4 @@ void DepthwiseConvolution::process_tile( } } - -// New templated struct used solely as a way to provide tile processing -// specialisations. -template -struct DepthwiseConvolutionImpl : public DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, TIn, TOut -> -{ - template < - int in_pad_top, int in_pad_left, int in_pad_bottom, int in_pad_right, - int out_pad_bottom, int out_pad_right - > - static void process_tile( - const int n_channels, - const TIn* const weights, - const TIn* const inptr, - const int in_row_stride, - const int in_col_stride, - TOut* const outptr, - const int out_row_stride, - const int out_col_stride - ) - { - // By default, redirect to parent. Specialised implementations can be added - // by overriding this method. 
- DepthwiseConvolution:: - template process_tile( - n_channels, - weights, - inptr, - in_row_stride, - in_col_stride, - outptr, - out_row_stride, - out_col_stride - ); - } -}; - } // namespace depthwise diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp index e7f0609b0c..7a216ed518 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp @@ -51,36 +51,58 @@ struct DepthwiseConvolutionImpl DWC; template < - int in_pad_top, int in_pad_left, int in_pad_bottom, int in_pad_right, - int out_pad_bottom, int out_pad_right + bool Specialize=false, // Specialize (or not) the method + int InPadTop=0, // If specialized, top padding + int InPadLeft=0, // If specialized, left padding + int InPadBottom=0, // If specialized, bottom padding + int InPadRight=0, // If specialized, right padding + int OutPadBottom=0, // If specialized, bottom output padding + int OutPadRight=0 // If specialized, bottom right padding > static void process_tile( const int n_channels, const float* const weights, + const int weight_row_stride, + const int weight_col_stride, const float* const inptr, const int in_row_stride, const int in_col_stride, float* const outptr, const int out_row_stride, - const int out_col_stride + const int out_col_stride, + const int in_pad_top=0, + const int in_pad_left=0, + const int in_pad_bottom=0, + const int in_pad_right=0, + const int out_pad_bottom=0, + const int out_pad_right=0 ); }; template template < - int in_pad_top, int in_pad_left, int in_pad_bottom, int in_pad_right, - int out_pad_bottom, int out_pad_right + bool Specialize, + int InPadTop, int InPadLeft, int InPadBottom, int InPadRight, + int OutPadBottom, int OutPadRight > void DepthwiseConvolutionImpl::process_tile( const int n_channels, - const float* const weights, - const float* const inptr, + const float *__restrict__ const weights, + const int weight_row_stride, + const int weight_col_stride, + const float *__restrict__ const inptr, const int in_row_stride, const int in_col_stride, - float* const outptr, + float *__restrict__ const outptr, const int out_row_stride, - const int out_col_stride + const int out_col_stride, + const int _in_pad_top, + const int _in_pad_left, + const int _in_pad_bottom, + const int _in_pad_right, + const int _out_pad_bottom, + const int _out_pad_right ) { constexpr auto inner_tile_rows = DWC::inner_tile_rows; @@ -92,19 +114,24 @@ void DepthwiseConvolutionImpl::process_t constexpr auto stride_rows = DWC::stride_rows; constexpr auto stride_cols = DWC::stride_cols; + // Extract parameters + const int in_pad_top = Specialize ? InPadTop : _in_pad_top; + const int in_pad_left = Specialize ? InPadLeft : _in_pad_left; + const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom; + const int in_pad_right = Specialize ? InPadRight : _in_pad_right; + const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom; + const int out_pad_right = Specialize ? 
OutPadRight : _out_pad_right; + // Compute valid ranges of the tile - constexpr int in_cells_i = inner_tile_rows - in_pad_bottom; - constexpr int in_cells_j = inner_tile_cols - in_pad_right; - constexpr int out_cells_i = output_tile_rows - out_pad_bottom; - constexpr int out_cells_j = output_tile_cols - out_pad_right; + const int in_cells_i = inner_tile_rows - in_pad_bottom; + const int in_cells_j = inner_tile_cols - in_pad_right; + const int out_cells_i = output_tile_rows - out_pad_bottom; + const int out_cells_j = output_tile_cols - out_pad_right; // Instantiate pointers - const float* inptr_base = inptr; - const float* wptr_base = weights; - float* outptr_base = outptr; - - const int weight_col_stride = n_channels; - const int weight_row_stride = kernel_cols * n_channels; + const float* __restrict__ inptr_base = inptr; + const float* __restrict__ wptr_base = weights; + float* __restrict__ outptr_base = outptr; // Perform the depthwise convolution int channels_remaining = n_channels; @@ -144,7 +171,7 @@ void DepthwiseConvolutionImpl::process_t wptr_base += 4; // Perform the convolution - float32x4_t v[out_cells_i][out_cells_j]; + float32x4_t v[output_tile_rows][output_tile_cols]; for (int out_i = 0; out_i < out_cells_i; out_i++) { for (int out_j = 0; out_j < out_cells_j; out_j++) @@ -222,7 +249,7 @@ void DepthwiseConvolutionImpl::process_t wptr_base++; // Perform the convolution - float v[out_cells_i][out_cells_j]; + float v[output_tile_rows][output_tile_cols]; for (int out_i = 0; out_i < out_cells_i; out_i++) { for (int out_j = 0; out_j < out_cells_j; out_j++) -- cgit v1.2.1
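
For context, a minimal usage sketch of the RSH depthwise engine declared in depthwise.hpp above. This is not part of the patch: the 2x2 output tile / 3x3 kernel / stride 1 specialisation, the contiguous NHWC buffer sizes, the assumption that get_output_size() is a static helper (as the output_size() override above suggests), and the include path are all illustrative assumptions.

    #include <vector>
    #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"

    int main()
    {
        // Assumed specialisation: 2x2 output tile, 3x3 kernel, stride 1, fp32 in/out.
        using Conv = depthwise::DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;

        const int  n_batches    = 1, n_rows = 32, n_cols = 32, n_channels = 64;
        const bool padding_same = true;

        // Output spatial size for the chosen padding mode, as used by output_size().
        const int out_rows = Conv::get_output_size(n_rows, padding_same);
        const int out_cols = Conv::get_output_size(n_cols, padding_same);

        // Contiguous HWC weights and NHWC input/output buffers (illustrative sizes).
        std::vector<float> weights(3 * 3 * n_channels);
        std::vector<float> input(n_batches * n_rows * n_cols * n_channels);
        std::vector<float> output(n_batches * out_rows * out_cols * n_channels);

        // The short constructor delegates to the stride-aware one with strides of 0,
        // which the implementation replaces with strides derived from the shape.
        Conv conv(n_batches, n_rows, n_cols, n_channels, padding_same,
                  weights.data(), input.data(), output.data());

        // get_window() now reports work in CHANNEL_BLOCK-sized chunks of channels;
        // a scheduler would normally split [0, get_window()) across threads.
        conv.run(0, conv.get_window());
        return 0;
    }

Parallelising run() over channel blocks rather than tile rows is what allows NEDepthwiseConvolutionLayer3x3Kernel to split the optimized path across threads.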