From 47d39dc615d1dee2482bc84699802165a9778ac8 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Mon, 11 Mar 2019 14:03:23 +0000 Subject: COMPMID-1975: Update depthwise convolution. Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/898 Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins Reviewed-by: Gian Marco Iodice --- .../kernels/NEDepthwiseConvolutionLayer3x3Kernel.h | 64 +- .../NEDepthwiseConvolutionAssemblyKernelWrapper.h | 88 +++ .../NEON/kernels/convolution/common/activation.hpp | 37 ++ .../NEON/kernels/convolution/common/padding.hpp | 74 +++ .../NEON/kernels/convolution/common/qasymm8.hpp | 54 ++ .../NEON/kernels/convolution/common/tensor.hpp | 69 +- .../kernels/convolution/depthwise/depthwise.hpp | 702 +++++++++++---------- .../convolution/depthwise/depthwise_quantized.hpp | 118 ++++ .../kernels/convolution/depthwise/impl_base.hpp | 628 ++++++++---------- arm_compute/core/utils/misc/InfoHelpers.h | 62 ++ .../NEON/functions/NEDepthwiseConvolutionLayer.h | 52 +- .../NEDepthwiseConvolutionAssemblyDispatch.h | 117 ++++ 12 files changed, 1281 insertions(+), 784 deletions(-) create mode 100644 arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h create mode 100644 arm_compute/core/NEON/kernels/convolution/common/activation.hpp create mode 100644 arm_compute/core/NEON/kernels/convolution/common/padding.hpp create mode 100644 arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp create mode 100644 arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp create mode 100644 arm_compute/core/utils/misc/InfoHelpers.h create mode 100644 arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h (limited to 'arm_compute') diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h index 64f10b4bd1..87ca4da05b 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -25,12 +25,10 @@ #define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ #include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" - -#include namespace arm_compute { +// Forward declarations class ITensor; /** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. */ @@ -60,23 +58,8 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] data_layout (Optional) Data layout of the input and weights tensor */ - void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, DataLayout data_layout = DataLayout::NCHW); - /** Static method that checks if optimized execution is supported for the given parameters - * - * @param[in] input_shape Input shape - * @param[in] conv_info Padding and stride information to use for the convolution. 
- * @param[in] dt Data type of the input and weights - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] data_layout (Optional) Data layout of the input and weights tensor - * - * @return True if the optimized kernels can be executed else false - */ - static bool is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier = 1, DataLayout data_layout = DataLayout::NCHW); - /** Generates the convolver object */ - void generate_convolver(); - + void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel * * @note Supported data layouts: NCHW and NHWC @@ -96,40 +79,13 @@ public: BorderSize border_size() const override; private: - void configure_generic(); - void configure_optimized(); - - void run_generic(const Window &window, const ThreadInfo &info); - void run_optimized(const Window &window, const ThreadInfo &info); - /** Creates an optimized backend convolver object - * - * @note Convolver of strides 1,2 and convolution size of 3 is currently supported - * - * @param[in] conv_info Padding and stride information to use for the convolution - * @param[in] w Weights tensor - * @param[in] in Input tensor - * @param[in] out Output tensor - * @param[in] setup_strides (Optional) Boolean to enable setting the strides of the tensors - * in the convolver in case of padding. Defaults to false - * - * @return A convolver object or nullptr if the configuration is not supported - */ - std::unique_ptr create_convolver_object(PadStrideInfo conv_info, - const ITensor *w, - const ITensor *in, - ITensor *out, - bool setup_strides = false); - -private: - BorderSize _border_size; - const ITensor *_input; - ITensor *_output; - const ITensor *_weights; - PadStrideInfo _conv_info; - std::unique_ptr _convolver; - unsigned int _num_elems_written_per_iteration; - bool _run_optimized; - unsigned int _depth_multiplier; + BorderSize _border_size; + const ITensor *_input; + ITensor *_output; + const ITensor *_weights; + PadStrideInfo _conv_info; + unsigned int _num_elems_written_per_iteration; + unsigned int _depth_multiplier; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h new file mode 100644 index 0000000000..def395ca1c --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__ +#define __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** This class is a wrapper for the depthwise convolution assembly kernels. */ +class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel +{ +public: + const char *name() const override + { + return "NEDepthwiseConvolutionAssemblyKernelWrapper"; + } + + /** Default constructor */ + NEDepthwiseConvolutionAssemblyKernelWrapper() + : _kernel(nullptr) + { + } + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete; + /** Default Move Constructor. */ + NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default; + /** Default move assignment operator */ + NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default; + + /** Initialise the kernel's input and output. + * + * @param[in] kernel Pointer to an assembly kernel implementation. + */ + void configure(depthwise::IDepthwiseConvolution *kernel) + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(kernel))); + _kernel = kernel; + Window win; + win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1)); + INEKernel::configure(win); + } + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + auto first = window.x().start(); + auto last = window.x().end(); + _kernel->run(first, last, info.thread_id); + } + +private: + depthwise::IDepthwiseConvolution *_kernel; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp new file mode 100644 index 0000000000..091b1652c9 --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +namespace neon_convolution_kernels +{ + +enum class ActivationFunction +{ + None, + ReLU, + ReLU6, +}; + +} diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp new file mode 100644 index 0000000000..33f77d7ee9 --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include + +// Utilities for copying tensor tiles and adding/removing padding. +namespace padding +{ + +/* Copy a tile and apply padding to the output copy. + */ +template +void copy_and_pad_tile( + unsigned int tile_rows, + unsigned int tile_cols, + unsigned int n_channels, + const T *inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + T* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride, + unsigned int pad_top, + unsigned int pad_left, + unsigned int pad_bottom, + unsigned int pad_right, + T pad_value=static_cast(0) +); + +/** Copy a tile and remove padding elements in the output. 
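+ * A hedged usage sketch (all values illustrative, and assuming the template
+ * parameters are the tile rows and columns and that the size and stride
+ * arguments are expressed in bytes, as the size_t types suggest): cropping a
+ * one-element border from an 8x8 tile of 16 float channels:
+ *
+ * @code
+ * padding::CopyCropped<8, 8>::execute(
+ *   16 * sizeof(float),                                // Data per position
+ *   inptr,  8 * 16 * sizeof(float), 16 * sizeof(float),
+ *   outptr, 6 * 16 * sizeof(float), 16 * sizeof(float),
+ *   1, 1, 1, 1                                         // Crop on each side
+ * );
+ * @endcode
+ *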
+ */ +template +class CopyCropped +{ + public: + static void execute( + size_t size, // Amount of data to copy + const void *inptr, + size_t in_row_stride, + size_t in_col_stride, + void *outptr, + size_t out_row_stride, + size_t out_col_stride, + unsigned int pad_top, + unsigned int pad_left, + unsigned int pad_bottom, + unsigned int pad_right + ); +}; + +} diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp new file mode 100644 index 0000000000..6029cb67e3 --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once +#include + +namespace qasymm8 +{ + +struct QAsymm8Params +{ + uint8_t quantize(float value) const; + float dequantize(uint8_t value) const; + + uint8_t offset; + float scale; +}; + +struct QAsymm8RescaleParams +{ + static QAsymm8RescaleParams make_rescale_params( + const QAsymm8Params& weight_quant, + const QAsymm8Params& input_quant, + const QAsymm8Params& output_quant + ); + + QAsymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale); + + const int32_t shift, multiplier; + const float rescale; +}; + +} diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp index 6567eeb23d..ad0a677a8f 100644 --- a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp +++ b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -54,6 +54,18 @@ struct Tensor4DShape { } + inline int index(const int n, const int i, const int j, const int c) const + { + if (this->ordering == NHWC) + { + return ((n*this->n_rows + i)*this->n_cols + j)*this->n_channels + c; + } + else // NCHW + { + return ((n*this->n_channels + c)*this->n_rows + i)*this->n_cols + j; + } + } + inline int size() const { return n_batches * n_rows * n_cols * n_channels; @@ -94,6 +106,18 @@ struct KernelShape { } + inline int index(int oc, int i, int j, int ic) const + { + if (this->ordering == HWIO) + { + return ((i*this->n_cols + j)*this->n_input_channels + ic)*this->n_output_channels + oc; + } + else // OIHW + { + return ((oc*this->n_input_channels + ic)*this->n_rows + i)*this->n_cols + j; + } + } + inline int size(void) const { return n_output_channels * n_rows * n_cols * n_input_channels; @@ -127,7 +151,16 @@ class Tensor4D final return shape.size() * sizeof(T); } - inline T& element(int, int, int, int) const; + /* Extract an element of the tensor. + * + * If the shape is a Tensor4DShape then the index is given as batch, row, + * column and channel. If the shape is a KernelShape then the index is + * given as output channel, row, column and input channel. + */ + inline T& element(const int a, const int b, const int c, const int d) const + { + return _data[shape.index(a, b, c, d)]; + } inline void Clear() { Fill(static_cast(0)); @@ -143,35 +176,3 @@ class Tensor4D final private: T* const _data; }; - - -template <> -inline float& Tensor4D::element(int n, int i, int j, int c) const -{ - int index; - if (shape.ordering == NHWC) - { - index = ((n*shape.n_rows + i)*shape.n_cols + j)*shape.n_channels + c; - } - else // NCHW - { - index = ((n*shape.n_channels + c)*shape.n_rows + i)*shape.n_cols + j; - } - return _data[index]; -} - - -template <> -inline float& Tensor4D::element(int oc, int i, int j, int ic) const -{ - int index; - if (shape.ordering == HWIO) - { - index = ((i*shape.n_cols + j)*shape.n_input_channels + ic)*shape.n_output_channels + oc; - } - else // OIHW - { - index = ((oc*shape.n_input_channels + ic)*shape.n_rows + i)*shape.n_cols + j; - } - return _data[index]; -} diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp index 6d9cb18f44..45e8da0272 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp @@ -24,42 +24,84 @@ #pragma once +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp" + namespace depthwise { +namespace nck = neon_convolution_kernels; + class IDepthwiseConvolution { public: virtual ~IDepthwiseConvolution() = default; - virtual int output_size(const int dim_size, const bool padding_same) const = 0; + virtual int output_size( int dim_size, unsigned int padding_before, unsigned int padding_after ) const = 0; + /* Set input tensor and stride. */ + virtual void set_input(const void *inptr) = 0; + virtual void set_input(const void *inptr, int column_stride) = 0; + virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0; + virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0; + + /* Set output tensor and stride. 
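+   *
+   * The overloads taking fewer strides assume a densely-packed NHWC tensor
+   * and derive the missing strides from the tensor shape (for instance, the
+   * single-argument form uses the channel count as the column stride). A
+   * hedged sketch, with `conv` and the stride names purely illustrative:
+   *
+   * @code
+   * conv.set_output(outptr);              // Dense NHWC: all strides derived
+   * conv.set_output(outptr, col_stride);  // Row and batch strides derived
+   * conv.set_output(outptr, batch_stride, row_stride, col_stride);
+   * @endcode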
*/ + virtual void set_output(void *outptr) = 0; + virtual void set_output(void *outptr, int column_stride) = 0; + virtual void set_output(void *outptr, int row_stride, int column_stride) = 0; + virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0; + + /* Weights and biases are re-ordered to improve memory access patterns. Use + * these methods to determine the size of the re-pack buffer and to set the + * address (and implicitly reorder the weights and biases into) the buffer. + */ + virtual size_t get_packed_params_size(void) const = 0; + virtual void set_packed_params_buffer(void *) = 0; + + virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0; + virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0; + virtual void pack_params( + void *buffer, + const void* weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const = 0; + + /* Working space is used to pad tensors on the fly. Before running any + * inference check the amount of space required, allocate and provide a + * pointer to the convolution engine. + */ + virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0; + virtual void set_working_space(void *) = 0; + virtual unsigned int get_window(void) const = 0; - virtual void set_offsets(int input_offset, int weights_offset) = 0; - virtual void run(const unsigned int start, const unsigned int stop) = 0; + virtual void run( + unsigned int start, + unsigned int stop, + unsigned int threadid=0 + ) = 0; }; template < - int OutputTileRows, - int OutputTileCols, - int KernelRows, - int KernelCols, - int StrideRows, - int StrideCols, - typename TIn, - typename TOut + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename TIn, typename TBias, typename TOut, + typename Derived > -class DepthwiseConvolution : public IDepthwiseConvolution +class DepthwiseConvolutionBase : public IDepthwiseConvolution { public: - typedef TIn InputType; - typedef TOut OutputType; - // Information about the specific convolution instance + using InputType = TIn; + using BiasType = TBias; + using OutputType = TOut; static constexpr int output_tile_rows = OutputTileRows; static constexpr int output_tile_cols = OutputTileCols; static constexpr int kernel_rows = KernelRows; @@ -71,260 +113,84 @@ class DepthwiseConvolution : public IDepthwiseConvolution /** Create a new depthwise convolution engine. * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_same True if padding is SAME, else VALID. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, padding_same, - weights, input, output, 0 /* column stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine. 
- * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_top Padding to apply to top of input. - * @param[in] padding_left Padding to apply to left of input. - * @param[in] padding_bottom Padding to apply to bottom of input. - * @param[in] padding_right Padding to apply to right of input. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - padding_top, padding_left, padding_bottom, padding_right, - weights, input, output, 0 /* column stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine with a specified column stride. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_same True if padding is SAME, else VALID. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] col_stride Stride between columns of the weights, inputs and output tensors. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int col_stride - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, padding_same, - weights, input, output, - col_stride, 0, /* Weight row stride = default */ - col_stride, 0, 0, /* Input row stride, batch stride = default */ - col_stride, 0, 0 /* Output row stride, batch stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine with a specified column stride. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_top Padding to apply to top of input. - * @param[in] padding_left Padding to apply to left of input. - * @param[in] padding_bottom Padding to apply to bottom of input. - * @param[in] padding_right Padding to apply to right of input. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] col_stride Stride between columns of the weights, inputs and output tensors. 
- */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int col_stride - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - padding_top, padding_left, padding_bottom, padding_right, - weights, input, output, - col_stride, 0, /* Weight row stride = default */ - col_stride, 0, 0, /* Input row stride, batch stride = default */ - col_stride, 0, 0 /* Output row stride, batch stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_same True if padding is SAME, else VALID. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately). - * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately). - * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately). - * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately). - * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately). - * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately). - * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately). - * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately). - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output, - int weight_col_stride, - int weight_row_stride, - int input_col_stride, - int input_row_stride, - int input_batch_stride, - int output_col_stride, - int output_row_stride, - int output_batch_stride - ); - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_top Padding to apply to top of input. - * @param[in] padding_left Padding to apply to left of input. - * @param[in] padding_bottom Padding to apply to bottom of input. - * @param[in] padding_right Padding to apply to right of input. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately). - * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately). 
- * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately). - * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately). - * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately). - * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately). - * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately). - * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately). + * @param[in] n_batches Number of batches tensors. + * @param[in] n_input_rows Number of rows in input tensor. + * @param[in] n_input_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input and output tensors. */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, + DepthwiseConvolutionBase( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, unsigned int padding_top, unsigned int padding_left, unsigned int padding_bottom, - unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output, - int weight_col_stride, - int weight_row_stride, - int input_col_stride, - int input_row_stride, - int input_batch_stride, - int output_col_stride, - int output_row_stride, - int output_batch_stride + unsigned int padding_right ); // Cannot copy or move a DepthwiseConvolution. - DepthwiseConvolution(DepthwiseConvolution&) = delete; - DepthwiseConvolution operator=(DepthwiseConvolution&) = delete; + DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete; + DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete; + + /* Set input tensor and stride. */ + void set_input(const void *inptr) override; + void set_input(const void *inptr, int column_stride) override; + void set_input(const void *inptr, int row_stride, int column_stride) override; + void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override; + + /* Set output tensor and stride. */ + void set_output(void *outptr) override; + void set_output(void *outptr, int column_stride) override; + void set_output(void *outptr, int row_stride, int column_stride) override; + void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override; /** Get the number of output rows/columns. * * @param[in] dim_size Number of elements in the dimension (rows/columns) * @param[in] same_padding True if the padding is SAME, otherwise false. */ - static int get_output_size(int dim_size, bool padding_same); static int get_output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after + int dim_size, unsigned int padding_before, unsigned int padding_after ); - /** Get the number of output rows/columns. - * - * @param[in] dim_size Number of elements in the dimension (rows/columns) - * @param[in] same_padding True if the padding is SAME, otherwise false. + int output_size( + int dim_size, unsigned int padding_before, unsigned int padding_after + ) const override; + + /* Determine how much memory is required to store the packed weights and + * biases. 
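+   *
+   * A hedged sketch of the expected call sequence (`conv`, `weights` and
+   * `biases` are illustrative names):
+   *
+   * @code
+   * std::vector<uint8_t> packed(conv.get_packed_params_size());
+   * conv.set_packed_params_buffer(packed.data());
+   * conv.pack_params(weights, biases);  // Re-orders into the buffer
+   * @endcode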
*/ - int output_size(int dim_size, bool padding_same) const override - { - return DepthwiseConvolution< - OutputTileRows, - OutputTileCols, - KernelRows, - KernelCols, - StrideRows, - StrideCols, - TIn, TOut - >::get_output_size(dim_size, padding_same); - } + size_t get_packed_params_size(void) const override; - int output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after - ) const override - { - return DepthwiseConvolution< - OutputTileRows, - OutputTileCols, - KernelRows, - KernelCols, - StrideRows, - StrideCols, - TIn, TOut - >::get_output_size(dim_size, padding_before, padding_after); - } - - /** Sets quantization offsets - * - * @param[in] input_offset Input offset - * @param[in] weights_offset Weights offset + /* Set the buffer for the packed weights and biases, and perform the + * packing. + */ + void set_packed_params_buffer(void *buffer) override; + + void pack_params(const void *weights, const void *biases=nullptr) const override; + + void pack_params( + void *buffer, + const void *weights, + const void *biases=nullptr + ) const override; + + void pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const override; + + /** Query the amount of working space required. + * @param[in] The largest number of threads which will be used to execute + * the kernel. + */ + size_t get_working_space_size(unsigned int n_threads=1) const override; + + /** Set the working space buffer. */ - void set_offsets(int input_offset, int weights_offset) override; + void set_working_space(void *buffer) override; /** Get the window of work to be performed by an instance of the operator. */ @@ -336,122 +202,282 @@ class DepthwiseConvolution : public IDepthwiseConvolution * * @param[in] start Start of the window of work to perform. * @param[in] stop End of the work to perform. + * @param[in] ID of the thread performing the work. */ - void run(unsigned int start, unsigned int stop) override; + void run( + unsigned int start, + unsigned int stop, + unsigned int threadid=0 + ) override; protected: + /** Get the value to use to pad the tensor. + */ + TIn _input_padding_value(void) const; + + /** Implementation of the parameter packing. + */ + void _pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const; + /** Process a tile-row of the tensors. */ - static void process_tile_row( + void process_tile_row( + unsigned int threadid, int n_channels, - const TIn* const weights, - const int weight_row_stride, - const int weight_col_stride, - const TIn* const inptr, - int in_row_stride, - int in_col_stride, - TOut* const outptr, - int out_row_stride, - int out_col_stride, + const void* packed_params, + const InputType* inptr, + OutputType* outptr, int row_pad_in_top, int row_pad_in_left, int row_pad_in_bottom, int row_pad_out_bottom, int n_tiles, int n_input_cols, - int n_output_cols, - int input_offset, - int weights_offset + int n_output_cols ); - // Determine the maximum (and minimum) padding values which can be applied - // to tiles of the tensors involved in this class of convolution. 
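
The working-space methods above size one pair of input/output padding
buffers per thread, so a caller that splits the window across workers must
allocate for all of them up front. A minimal sketch, assuming an engine
named `conv` and an illustrative thread count:

    const unsigned int n_threads = 4;  // Illustrative
    std::vector<uint8_t> scratch(conv.get_working_space_size(n_threads));
    conv.set_working_space(scratch.data());
    // Each worker then runs a disjoint slice of the window:
    //   conv.run(start, stop, thread_id);
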
- static constexpr int max_in_pad_top = (kernel_rows - 1) / 2; - static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2; - - static constexpr int max_in_pad_left = (kernel_cols - 1) / 2; - static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2; - - static constexpr int max_in_pad_bottom = inner_tile_rows; - static constexpr int max_in_pad_right = inner_tile_cols; - static constexpr int max_out_pad_bottom = output_tile_rows; - static constexpr int max_out_pad_right = output_tile_cols; - - static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1; - static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1; - static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1; - static constexpr int n_in_pad_right_fns = max_in_pad_right + 1; - static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1; - static constexpr int n_out_pad_right_fns = max_out_pad_right + 1; - - /** Pointer to a function which will process a tile. + /** Process a single tile of the tensor. * - * @param[in] n_channels Number of channels. - * @param[in] weights Pointer to Height x Width x Channels ordered weights. - * @param[in] inptr Pointer to the top-left unpadded value of the tile. - * @param[in] in_row_stride Stride between rows of the input tensor. - * @param[in] in_col_stride Stride between columns of the input tensor. - * @param[out] outptr Pointer to the top-left output value for the tile. - * @param[in] out_row_stride Stride between rows of the output tensor. - * @param[in] out_col_stride Stride between columns of the output tensor. - * - * The following parameters may be ignored if the function has been - * specialised for specific padding constraints. - * - * @param[in] _in_pad_top Padding to apply to top of input tile. - * @param[in] _in_pad_left Padding to apply to left of input tile. - * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. - * @param[in] _in_pad_right Padding to apply to right of input tile. - * @param[in] _out_pad_bottom Null cells at bottom of output tile. - * @param[in] _out_pad_right Null cells at right of output tile. + * This method will apply input/output padding (if required) and call the + * depthwise tile implementation. */ - typedef void (*TileFn)( + void process_tile( + unsigned int threadid, int n_channels, - const TIn* const weights, - int weight_row_stride, - int weight_col_stride, - const TIn* const inptr, - int in_row_stride, - int in_col_stride, - TOut* const outptr, - int out_row_stride, - int out_col_stride, - int _in_pad_top, - int _in_pad_left, - int _in_pad_bottom, - int _in_pad_right, - int _out_pad_bottom, - int _out_pad_right, - int _input_offset, - int _weights_offset + const void* packed_params, + const InputType* inptr, + OutputType* outptr, + int pad_in_top, + int pad_in_left, + int pad_in_bottom, + int pad_in_right, + int pad_out_bottom, + int pad_out_right ); - /* Arrays of methods to process tensor tiles. - * - * Allows dynamic dispatch to specialized implementations based on - * different padding configurations. + /** Perform depthwise convolution on a single tile. 
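+   *
+   * Note: implementations of this method are resolved statically through the
+   * CRTP `Derived` template parameter (just as `pack_params` forwards to
+   * `Derived::_pack_params` via `static_cast<Derived *>(this)`), so each
+   * specialisation can supply its own vectorised tile kernel without paying
+   * for virtual dispatch.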
*/ - static const TileFn tilefn_unpadded; - static const TileFn tilefn_top[n_in_pad_top_fns]; - static const TileFn tilefn_left[n_in_pad_left_fns]; - static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns]; - static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns]; - static const TileFn tilefn_generic; + template + void execute_tile( + int n_channels, + const void* packed_params, + const InputType* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + OutputType* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); + + int n_channels(void) const; private: // Member variables of instances of a convolution engine. - const TIn* const _weights; - const TIn* const _input; - TOut* const _output; + const InputType* _input; + OutputType* _output; + void* _packed_parameters; + void* _working_space; // Per-thread working space const int _n_batches, _n_input_rows, _n_input_cols, _n_channels, _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols; const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right; + const nck::ActivationFunction _activation; // Stride information for a convolution instance - const int _weight_col_stride, _weight_row_stride; - const int _input_col_stride, _input_row_stride, _input_batch_stride; - const int _output_col_stride, _output_row_stride, _output_batch_stride; - int _input_offset, _weights_offset; + int _input_col_stride, _input_row_stride, _input_batch_stride; + const int _input_ws_col_stride, _input_ws_row_stride; + int _output_col_stride, _output_row_stride, _output_batch_stride; + const int _output_ws_col_stride, _output_ws_row_stride; + + // Methods for getting access to working space + size_t _get_input_working_space_size(void) const; + size_t _get_output_working_space_size(void) const; + + void *_get_input_working_space(unsigned int threadid) const; + void *_get_output_working_space(unsigned int threadid) const; }; + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename TIn, typename TBias, typename TOut +> +class DepthwiseConvolution : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut + > +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut + > >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + using Base::DepthwiseConvolutionBase; + + protected: + template + void execute_tile( + int n_channels, + const void* packed_params, + const TIn* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + TOut* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); +}; + + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float +> : public 
DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float + > +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float + > >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + protected: + template + void execute_tile( + int n_channels, + const void* packed_params, + const float* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + float* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); +}; + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t +> : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t + > +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t + > >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + protected: + template + void execute_tile( + int n_channels, + const void* packed_params, + const float16_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + float16_t* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); +}; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC } // namespace depthwise diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp new file mode 100644 index 0000000000..4c1d883a70 --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once +#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp" + +namespace depthwise +{ + +namespace nck = neon_convolution_kernels; + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t, uint8_t, + QAsymm8DepthwiseConvolution +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t, uint8_t, + QAsymm8DepthwiseConvolution + >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + QAsymm8DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + const qasymm8::QAsymm8Params& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + QAsymm8DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + const qasymm8::QAsymm8Params& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + const qasymm8::QAsymm8RescaleParams& rescale_parameters, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + protected: + static nck::ActivationFunction get_activation_fn( + nck::ActivationFunction activation, + const qasymm8::QAsymm8Params& output_quantisation + ); + + uint8_t _input_padding_value(void) const; + + void _pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const; + + template + void execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + uint8_t* outptr, + 
unsigned int out_row_stride, + unsigned int out_col_stride + ); + + private: + // Quantization parameters + const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant; + const qasymm8::QAsymm8RescaleParams rescale_parameters; +}; + +} // namespace depthwise diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp index b33f2768ad..674fc4d2df 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp @@ -31,101 +31,73 @@ */ #include +#include #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" #pragma once -namespace depthwise -{ +#define MEMBERFN(TOUT) template <\ + unsigned int OutputTileRows, unsigned int OutputTileColumns,\ + unsigned int KernelRows, unsigned int KernelColumns,\ + unsigned int StrideRows, unsigned int StrideColumns,\ + typename TIn, typename TBias, typename TOut,\ + typename Derived\ +> TOUT DepthwiseConvolutionBase<\ + OutputTileRows, OutputTileColumns,\ + KernelRows, KernelColumns,\ + StrideRows, StrideColumns,\ + TIn, TBias, TOut, Derived\ +> -const unsigned int CHANNEL_BLOCK = 16; +using namespace neon_convolution_kernels; -namespace +namespace depthwise { - inline int pad_along_dim( - const bool padding_same, - const int kernel_dim, - const int stride_dim, - const int input_dim - ) - { - if (!padding_same) - return 0; - if (input_dim % stride_dim) - return std::max(kernel_dim - (input_dim % stride_dim), 0); - else - return std::max(kernel_dim - stride_dim, 0); - } -} // namespace -template -int DepthwiseConvolution::get_output_size( - const int dim_size, const bool same_padding -) +template +struct PackParameters { - return iceildiv(dim_size - (same_padding ? 
0 : (KC - 1)), SR); -} + static void execute( + unsigned int n_channels, + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases + ); +}; -template -int DepthwiseConvolution::get_output_size( +const unsigned int CHANNEL_BLOCK = 16; + +MEMBERFN(int)::get_output_size( const int dim_size, const unsigned int padding_before, const unsigned int padding_after ) { - return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR); + return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows); } -template -DepthwiseConvolution::DepthwiseConvolution( - const int n_batches, const int n_input_rows, const int n_input_cols, - const int n_channels, const bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int weight_col_stride, - const int weight_row_stride, - const int input_col_stride, - const int input_row_stride, - const int input_batch_stride, - const int output_col_stride, - const int output_row_stride, - const int output_batch_stride -) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, - n_channels, - pad_along_dim(padding_same, KR, SR, n_input_rows) / 2, /* top padding */ - pad_along_dim(padding_same, KC, SC, n_input_cols) / 2, /* left padding */ - iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2), /* bottom padding */ - iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2), /* right padding */ - weights, input, output, - weight_col_stride, weight_row_stride, - input_col_stride, input_row_stride, input_batch_stride, - output_col_stride, output_row_stride, output_batch_stride -) +MEMBERFN(int)::output_size( + const int dim_size, const unsigned int padding_before, const unsigned int padding_after +) const { + return get_output_size(dim_size, padding_before, padding_after); } - -template -DepthwiseConvolution::DepthwiseConvolution( - const int n_batches, const int n_input_rows, const int n_input_cols, +MEMBERFN()::DepthwiseConvolutionBase( + const int n_batches, + const int n_input_rows, + const int n_input_cols, const int n_channels, + ActivationFunction activation, const unsigned int padding_top, const unsigned int padding_left, const unsigned int padding_bottom, - const unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int weight_col_stride, - const int weight_row_stride, - const int input_col_stride, - const int input_row_stride, - const int input_batch_stride, - const int output_col_stride, - const int output_row_stride, - const int output_batch_stride -) : _weights(weights), _input(input), _output(output), + const unsigned int padding_right +) : _input(nullptr), _output(nullptr), + _packed_parameters(nullptr), + _working_space(nullptr), _n_batches(n_batches), _n_input_rows(n_input_rows), _n_input_cols(n_input_cols), @@ -138,37 +110,157 @@ DepthwiseConvolution::DepthwiseConvolution( _padding_left(padding_left), _padding_bottom(padding_bottom), _padding_right(padding_right), - _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels), - _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride), - _input_col_stride(input_col_stride ? input_col_stride : _n_channels), - _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride), - _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride), - _output_col_stride(output_col_stride ? 
output_col_stride : _n_channels), - _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride), - _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride), - _input_offset(0), _weights_offset(0) + _activation(activation), + _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0), + _input_ws_col_stride(_n_channels), + _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols), + _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0), + _output_ws_col_stride(_n_channels), + _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns) { } +MEMBERFN(void)::set_input(const void* const inptr) +{ + set_input(inptr, _n_channels); +} -template -unsigned int DepthwiseConvolution::get_window() const +MEMBERFN(void)::set_input(const void* const inptr, const int ld_col) { - // Parallelise over blocks of channels. - return iceildiv(_n_channels, CHANNEL_BLOCK); + set_input(inptr, _n_input_cols * ld_col, ld_col); +} + +MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col) +{ + set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col); +} + +MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col) +{ + _input = static_cast(inptr); + _input_batch_stride = ld_batch; + _input_row_stride = ld_row; + _input_col_stride = ld_col; +} + +MEMBERFN(void)::set_output(void* const outptr) +{ + set_output(outptr, _n_channels); +} + +MEMBERFN(void)::set_output(void* const outptr, const int ld_col) +{ + set_output(outptr, _n_output_cols * ld_col, ld_col); +} + +MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col) +{ + set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col); +} + +MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col) +{ + _output = static_cast(outptr); + _output_batch_stride = ld_batch; + _output_row_stride = ld_row; + _output_col_stride = ld_col; +} + +MEMBERFN(size_t)::get_packed_params_size(void) const +{ + return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); +} + +MEMBERFN(void)::set_packed_params_buffer(void *buffer) +{ + _packed_parameters = buffer; +} + +MEMBERFN(void)::pack_params(const void *weights, const void *biases) const +{ + static_cast(this)->pack_params(_packed_parameters, weights, biases); +} + +MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const +{ + const unsigned int weight_col_stride = _n_channels; + const unsigned int weight_row_stride = KernelColumns * weight_col_stride; + static_cast(this)->pack_params( + buffer, weights, weight_row_stride, weight_col_stride, biases + ); +} + +MEMBERFN(void)::pack_params( + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const +{ + static_cast(this)->_pack_params( + buffer, weights, weight_row_stride, weight_col_stride, biases + ); +} + +MEMBERFN(void)::_pack_params( + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const +{ + // Default implementation + PackParameters::execute( + _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases + ); +} + +MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const +{ + return nthreads * ( + 
+    _get_input_working_space_size() + _get_output_working_space_size()
+  );
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
+MEMBERFN(void)::set_working_space(void *buffer)
 {
-  _input_offset = input_offset;
-  _weights_offset = weights_offset;
+  _working_space = buffer;
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
+MEMBERFN(size_t)::_get_input_working_space_size(void) const
+{
+  return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels;
+}
+
+MEMBERFN(size_t)::_get_output_working_space_size(void) const
+{
+  return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels;
+}
+
+MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
+{
+  return static_cast<uint8_t *>(_working_space) + threadid * (
+    _get_input_working_space_size() + _get_output_working_space_size()
+  );
+}
+
+MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
+{
+  return static_cast<uint8_t *>(_get_input_working_space(threadid)) + _get_input_working_space_size();
+}
+
+MEMBERFN(unsigned int)::get_window() const
+{
+  // Parallelise over blocks of channels.
+  return iceildiv(_n_channels, CHANNEL_BLOCK);
+}
+
+MEMBERFN(void)::run(
   const unsigned int start,
-  const unsigned int stop
+  const unsigned int stop,
+  const unsigned int threadid
 )
 {
   // Parallelise over blocks of channels
@@ -205,43 +297,38 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
       const int output_row_bottom = (tile_i + 1)*output_tile_rows;
       const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
 
+      // Get the offset into the packed parameters
+      const auto params_ptr = static_cast<const uint8_t *>(_packed_parameters) +
+                              start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
+
       // Process the row
       process_tile_row(
+        threadid,
         stop_channel - start_channel,
-        _weights + start_channel, _weight_row_stride, _weight_col_stride,
-        inptr_row + start_channel, _input_row_stride, _input_col_stride,
-        outptr_row + start_channel, _output_row_stride, _output_col_stride,
+        params_ptr,
+        inptr_row + start_channel,
+        outptr_row + start_channel,
         input_row_pad_top, input_pad_left, input_row_pad_bottom,
         output_row_pad_bottom,
-        _n_tile_cols, _n_input_cols, _n_output_cols,
-        _input_offset, _weights_offset
+        _n_tile_cols, _n_input_cols, _n_output_cols
      );
    }
  }
}
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
+MEMBERFN(void)::process_tile_row(
+  const unsigned int threadid,
   const int n_channels,
-  const TIn* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
+  const void* const packed_params,
   const TIn* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
   TOut* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
   const int row_pad_in_top,
   const int row_pad_in_left,
   const int row_pad_in_bottom,
   const int row_pad_out_bottom,
   const int n_tiles,
   const int n_input_cols,
-  const int n_output_cols,
-  const int input_offset,
-  const int weights_offset
+  const int n_output_cols
 )
 {
   constexpr int tile_overlap = kernel_cols - stride_cols;
@@ -261,264 +348,97 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row
     // Get pointers into the inputs and outputs
     const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
-    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride);
-    TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
-
-    // Apply the specific tile processing function
-    const bool pad_top = row_pad_in_top > 0;
-    const bool pad_left = t_pad_in_left > 0;
-    const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom;
-    const bool pad_right = t_pad_in_right || t_pad_out_right;
-
-    const TileFn tilefn = [&] () {
-      if (!pad_top && !pad_left && !pad_bottom && !pad_right)
-      {
-        // No padding
-        return tilefn_unpadded;
-      }
-      else if (pad_top && !pad_left && !pad_bottom && !pad_right)
-      {
-        // Padding on the top only, subtract off the minimum expected padding in
-        // order to index into the array of specialised methods.
-        const int index = row_pad_in_top - min_in_pad_top;
-        return tilefn_top[index];
-      }
-      else if (!pad_top && pad_left && !pad_bottom && !pad_right)
-      {
-        // Padding on the left only, subtract off the minimum expected padding in
-        // order to index into the array of specialised methods.
-        const int index = t_pad_in_left - min_in_pad_left;
-        return tilefn_left[index];
-      }
-      else if (!pad_top && !pad_left && pad_bottom && !pad_right)
-      {
-        // Padding on the bottom only
-        return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
-      }
-      else if (!pad_top && !pad_left && !pad_bottom && pad_right)
-      {
-        // Padding on the right only
-        return tilefn_right[t_pad_in_right][t_pad_out_right];
-      }
-      else
-      {
-        // Otherwise use generic tile processing method.
-        return tilefn_generic;
-      }
-    }();
-
-    tilefn(
-      n_channels,
-      weights, weight_row_stride, weight_col_stride,
-      inptr_col, in_row_stride, in_col_stride,
-      outptr_col, out_row_stride, out_col_stride,
-      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
-      row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
+    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
+    TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
+
+    // Process just this tile
+    process_tile(
+      threadid, n_channels, packed_params, inptr_col, outptr_col,
+      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,  // Input paddings
+      row_pad_out_bottom, t_pad_out_right  // Output paddings
     );
   }
 }
-
-// New templated struct used solely as a way to provide tile processing
-// specialisations.
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, int StrideRows, int StrideCols, typename TIn, typename TOut>
-struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols, TIn, TOut
->
+MEMBERFN(TIn)::_input_padding_value(void) const
 {
-  typedef DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TOut
-  > DWC;
-
-  /** Perform the depthwise convolution of a tile.
-   *
-   * @param[in] n_channels Number of channels.
-   * @param[in] weights Pointer to Height x Width x Channels ordered weights.
-   * @param[in] inptr Pointer to the top-left unpadded value of the tile.
-   * @param[in] in_row_stride Stride between rows of the input tensor.
-   * @param[in] in_col_stride Stride between columns of the input tensor.
-   * @param[out] outptr Pointer to the top-left output value for the tile.
-   * @param[in] out_row_stride Stride between rows of the output tensor.
-   * @param[in] out_col_stride Stride between columns of the output tensor.
-   *
-   * The following parameters may be ignored if the function has been
-   * specialised for specific padding constraints.
-   *
-   * @param[in] _in_pad_top Padding to apply to top of input tile.
-   * @param[in] _in_pad_left Padding to apply to left of input tile.
-   * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
-   * @param[in] _in_pad_right Padding to apply to right of input tile.
-   * @param[in] _out_pad_bottom Null cells at bottom of output tile.
-   * @param[in] _out_pad_right Null cells at right of output tile.
-   */
-  template <
-    bool Specialize=false,  // Specialize (or not) the method
-    int InPadTop=0,         // If specialized, top padding
-    int InPadLeft=0,        // If specialized, left padding
-    int InPadBottom=0,      // If specialized, bottom padding
-    int InPadRight=0,       // If specialized, right padding
-    int OutPadBottom=0,     // If specialized, bottom output padding
-    int OutPadRight=0       // If specialized, bottom right padding
-  >
-  static void process_tile(
-    const int n_channels,
-    const TIn* const weights,
-    const int weight_row_stride,
-    const int weight_col_stride,
-    const TIn* const inptr,
-    const int in_row_stride,
-    const int in_col_stride,
-    TOut* const outptr,
-    const int out_row_stride,
-    const int out_col_stride,
-    const int in_pad_top=0,
-    const int in_pad_left=0,
-    const int in_pad_bottom=0,
-    const int in_pad_right=0,
-    const int out_pad_bottom=0,
-    const int out_pad_right=0,
-    const int input_offset=0,
-    const int weights_offset=0
-  );
-};
-
+  return static_cast<TIn>(0);
+}
 
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, int StrideRows, int StrideCols, typename TIn, typename TOut>
-template <
-  bool Specialize,
-  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-  int OutPadBottom, int OutPadRight
->
-void DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, TIn, TOut>::process_tile(
+MEMBERFN(void)::process_tile(
+  const unsigned int threadid,
   const int n_channels,
-  const TIn *__restrict__ const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const TIn *__restrict__ const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  TOut *__restrict__ const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int _in_pad_top,
-  const int _in_pad_left,
-  const int _in_pad_bottom,
-  const int _in_pad_right,
-  const int _out_pad_bottom,
-  const int _out_pad_right,
-  const int _input_offset,
-  const int _weights_offset
+  const void* const packed_params,
+  const TIn* const inptr,
+  TOut* const outptr,
+  const int pad_in_top,
+  const int pad_in_left,
+  const int pad_in_bottom,
+  const int pad_in_right,
+  const int pad_out_bottom,
+  const int pad_out_right
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
-
-  // Extract parameters
-  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
-  // Instantiate pointers
-  const TIn* __restrict__ inptr_base = inptr;
-  const TIn* __restrict__ wptr_base = weights;
-  TOut* __restrict__ outptr_base = outptr;
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  for (; channels_remaining; channels_remaining--)
+  const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
+  const bool pad_output = pad_out_bottom || pad_out_right;
+
+  if (pad_input)
   {
-    // Load input tile
-    TIn u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<TIn>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
-      }
-    }
-    inptr_base++;
+    // Copy the input into the temporary buffer, applying padding
+    padding::copy_and_pad_tile(
+      inner_tile_rows, inner_tile_cols, n_channels,
+      inptr, _input_row_stride, _input_col_stride,
+      static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride,
+      pad_in_top, pad_in_left, pad_in_bottom, pad_in_right,
+      static_cast<const Derived *>(this)->_input_padding_value()
+    );
+  }
 
-    // Load weights tile
-    TIn w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
-    {
-      const TIn* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
-      {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
-      }
-    }
-    wptr_base++;
+  // Execute the kernel
+  const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid));
+  const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride;
+  const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride;
 
-    // Perform the convolution
-    TOut v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
-    {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
-      {
-        // Clear the accumulator
-        v[out_i][out_j] = static_cast<TOut>(0);
-
-        // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
-
-        // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
-        {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-      }
-    }
+  TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid));
+  const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride;
+  const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride;
-
-  // Store the output tile
-  for (int i = 0; i < out_cells_i; i++)
-  {
-    TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
-    for (int j = 0; j < out_cells_j; j++)
-    {
-      *(outptr_row + j*out_col_stride) = v[i][j];
-    }
-  }
-  outptr_base++;
+  Derived * dthis = static_cast<Derived *>(this);
+
+  switch(_activation)
+  {
+    case ActivationFunction::ReLU:
+      dthis->template execute_tile<ActivationFunction::ReLU>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+    case ActivationFunction::ReLU6:
+      dthis->template execute_tile<ActivationFunction::ReLU6>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+    default:
+      dthis->template execute_tile<ActivationFunction::None>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
   }
+
+  if (pad_output)
+  {
+    // Copy the output from the temporary buffer, removing unnecessary values
+    padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute(
+      n_channels * sizeof(TOut),
+      _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut),
+      outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut),
+      0, 0, pad_out_bottom, pad_out_right
+    );
+  }
+}
+
+MEMBERFN(int)::n_channels(void) const
+{
+  return _n_channels;
 }
 }  // namespace depthwise
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
new file mode 100644
index 0000000000..704e178292
--- /dev/null
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MISC_INFO_HELPERS_H__
+#define __ARM_COMPUTE_MISC_INFO_HELPERS_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace utils
+{
+namespace info_helpers
+{
+/** Checks if activation information corresponds to a relu activation function
+ *
+ * @param[in] activation_info Activation metadata
+ *
+ * @return True if the activation metadata corresponds to a relu activation, else false
+ */
+inline bool is_relu(ActivationLayerInfo activation_info)
+{
+    return activation_info.enabled() && activation_info.activation() == ActivationLayerInfo::ActivationFunction::RELU;
+}
+
+/** Checks if activation information corresponds to a relu6 activation function
+ *
+ * @param[in] activation_info Activation metadata
+ *
+ * @return True if the activation metadata corresponds to a relu6 activation, else false
+ */
+inline bool is_relu6(ActivationLayerInfo activation_info)
+{
+    return activation_info.enabled()
+           && activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+           && activation_info.a() == 6.f && activation_info.b() == 0.f;
+}
+} // namespace info_helpers
+} // namespace utils
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_INFO_HELPERS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index e2fe11ea7f..28f0560e93 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -53,7 +54,15 @@ class NEDepthwiseConvolutionLayer3x3 : public IFunction
 {
 public:
     /** Default constructor */
-    NEDepthwiseConvolutionLayer3x3();
+    NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayer3x3(const NEDepthwiseConvolutionLayer3x3 &) = delete;
+    /** Default move constructor */
+    NEDepthwiseConvolutionLayer3x3(NEDepthwiseConvolutionLayer3x3 &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionLayer3x3 &operator=(const NEDepthwiseConvolutionLayer3x3 &) = delete;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionLayer3x3 &operator=(NEDepthwiseConvolutionLayer3x3 &&) = default;
     /** Initialize the function's source, destination, kernels and border_size.
      *
      * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
@@ -86,9 +95,44 @@ public:
 
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
+    /** Configure the kernels/functions for the generic pipeline.
+     *
+     * @param[in, out] input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]      weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]      biases           (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                                  Data type supported: Same as @p input.
+     * @param[out]     output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]      conv_info        Padding and stride information to use for the convolution.
+     * @param[in]      depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth.
+     * @param[in]      act_info         Activation layer information in case of a fused activation.
+     */
+    void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                           unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+    /** Configure the kernels/functions for the optimized pipeline.
+     *
+     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  biases           (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[out] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth.
+     * @param[in]  act_info         Activation layer information in case of a fused activation.
+     */
+    void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+    /** Run generic kernel */
+    void run_generic();
+    /** Run optimized function */
+    void run_optimized();
+
+private:
+    MemoryGroup                               _memory_group;
     NEDepthwiseConvolutionLayer3x3Kernel      _dwc_kernel;
+    NEDepthwiseConvolutionAssemblyDispatch    _dwc_optimized_func;
     NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
     NEFillBorderKernel                        _border_handler;
     NEPermute                                 _permute_input;
@@ -99,14 +143,14 @@ private:
     Tensor                                    _permuted_input;
     Tensor                                    _permuted_weights;
     Tensor                                    _permuted_output;
+    const ITensor                            *_original_weights;
     bool                                      _has_bias;
     bool                                      _is_quantized;
     bool                                      _is_optimized;
-    bool                                      _are_weights_reshaped;
     bool                                      _is_nchw;
-    bool                                      _is_first_run;
     bool                                      _permute;
     bool                                      _is_activationlayer_enabled;
+    bool                                      _is_prepared;
 };
 
 /** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
new file mode 100644
index 0000000000..df8f29d2c7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__
+#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+
+namespace arm_compute
+{
+/** Depthwise convolution assembly kernel glue */
+class NEDepthwiseConvolutionAssemblyDispatch : public IFunction
+{
+public:
+    /** Default constructor
+     *
+     * @param[in,out] memory_manager Memory manager to use
+     */
+    NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyDispatch(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
+    /** Default move constructor */
+    NEDepthwiseConvolutionAssemblyDispatch(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+    /** Initialize the function's source, destination, kernels and border_size.
+     *
+     * @note Supports only NHWC format
+     *
+     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[out] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+                   const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
+     *
+     * @note Supports only NHWC format
+     *
+     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                              Data type supported: Same as @p input.
+     * @param[out] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in]  conv_info        Padding and stride information to use for the convolution.
+     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return An error status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
+                           const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Check if the optimized kernel can be used for the given kernel sizes and strides
+     *
+     * @warning Even if this returns true, the inputs and outputs might need to get permuted as the only layout supported is NHWC
+     *
+     * @param[in] input            Input tensor info.
+     * @param[in] weights          Weights tensor info.
+     * @param[in] conv_info        Convolution layer metadata.
+     * @param[in] depth_multiplier (Optional) Depth multiplier to be used.
+     *
+     * @return True if the assembly kernel could be used, else false. Note that transformations of input/output could be needed.
+     */
+    static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1);
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    MemoryGroup                                        _memory_group;
+    const ITensor                                     *_input;
+    const ITensor                                     *_weights;
+    const ITensor                                     *_bias;
+    ITensor                                           *_output;
+    Tensor                                             _packed_weights;
+    Tensor                                             _workspace;
+    bool                                               _is_prepared;
+    std::unique_ptr<depthwise::IDepthwiseConvolution>  _dwc_assembly_kernel;
+    NEDepthwiseConvolutionAssemblyKernelWrapper        _dwc_acl_kernel;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__ */
-- 
cgit v1.2.1
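
Usage sketch (editor's addition, not part of the patch): the refactor above turns the convolver into a stateful object whose expensive steps are explicit — parameters are packed once, per-thread scratch is sized up front, tensors are bound late via the set_input/set_output overloads, and run() walks a window of CHANNEL_BLOCK-sized channel ranges. The fragment below strings those entry points together in the order the code implies. The concrete template arguments (a 2x2 output tile for a 3x3 stride-1 kernel), the NHWC shapes, and the single-threaded window split are illustrative assumptions, not values taken from this patch.

    #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"

    #include <cstdint>
    #include <vector>

    void run_depthwise_example(const float *weights, const float *biases,
                               const float *input, float *output)
    {
        using namespace depthwise;
        // Template argument order assumed: <OutputTileRows, OutputTileCols,
        // KernelRows, KernelCols, StrideRows, StrideCols, TIn, TBias, TOut>.
        using DWC = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;

        // 1 batch of a 32x32x64 NHWC input, with padding of 1 on every edge.
        // ActivationFunction is spelled unqualified as in the impl above; its
        // exact namespace (common/activation.hpp) is assumed.
        DWC dwc(1, 32, 32, 64, ActivationFunction::ReLU, 1, 1, 1, 1);

        // One-off step: interleave weights and biases into a packed buffer.
        std::vector<uint8_t> packed(dwc.get_packed_params_size());
        dwc.set_packed_params_buffer(packed.data());
        dwc.pack_params(weights, biases);

        // Per-thread scratch used to materialise padded input/output tiles.
        std::vector<uint8_t> scratch(dwc.get_working_space_size(1 /* nthreads */));
        dwc.set_working_space(scratch.data());

        // Late tensor binding; strides default to dense NHWC.
        dwc.set_input(input);
        dwc.set_output(output);

        // The window is a count of channel blocks; a scheduler would split
        // [0, get_window()) across worker threads with distinct threadids.
        dwc.run(0, dwc.get_window(), 0 /* threadid */);
    }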
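On the runtime side, the new InfoHelpers predicates are what let NEDepthwiseConvolutionAssemblyDispatch decide whether a fused ActivationLayerInfo can be folded into the assembly kernel, since the convolver only specialises ReLU and ReLU6 tiles. A hypothetical translation helper along those lines — map_activation is an invented name, and the namespace of the convolver's ActivationFunction enum is assumed:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/misc/InfoHelpers.h"
    #include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"

    // Translate a fused activation into the convolver's enum. Anything other
    // than ReLU or bounded ReLU with a == 6 falls back to None and would have
    // to run as a separate activation layer.
    depthwise::ActivationFunction map_activation(const arm_compute::ActivationLayerInfo &act)
    {
        using namespace arm_compute::utils::info_helpers;
        if(is_relu(act))
        {
            return depthwise::ActivationFunction::ReLU;
        }
        if(is_relu6(act))
        {
            return depthwise::ActivationFunction::ReLU6;
        }
        return depthwise::ActivationFunction::None;
    }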