aboutsummaryrefslogtreecommitdiff
path: root/arm_compute
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2019-03-11 14:03:23 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2019-03-29 09:54:53 +0000
commit47d39dc615d1dee2482bc84699802165a9778ac8 (patch)
tree87f2fdb4f4957be7ff1c043be6328e4154cdf9e1 /arm_compute
parent2d2551ed3934f071eb6a65f5b776301454bc147a (diff)
downloadComputeLibrary-47d39dc615d1dee2482bc84699802165a9778ac8.tar.gz
COMPMID-1975: Update depthwise convolution.
Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8 Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-on: https://review.mlplatform.org/c/898 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h64
-rw-r--r--arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h88
-rw-r--r--arm_compute/core/NEON/kernels/convolution/common/activation.hpp37
-rw-r--r--arm_compute/core/NEON/kernels/convolution/common/padding.hpp74
-rw-r--r--arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp54
-rw-r--r--arm_compute/core/NEON/kernels/convolution/common/tensor.hpp69
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp702
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp118
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp628
-rw-r--r--arm_compute/core/utils/misc/InfoHelpers.h62
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h52
-rw-r--r--arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h117
12 files changed, 1281 insertions, 784 deletions
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 64f10b4bd1..87ca4da05b 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,12 +25,10 @@
#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__
#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-#include <memory>
namespace arm_compute
{
+// Forward declarations
class ITensor;
/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. */
@@ -60,23 +58,8 @@ public:
* @param[out] output Destination tensor. Data type supported: Same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] data_layout (Optional) Data layout of the input and weights tensor
*/
- void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, DataLayout data_layout = DataLayout::NCHW);
- /** Static method that checks if optimized execution is supported for the given parameters
- *
- * @param[in] input_shape Input shape
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] dt Data type of the input and weights
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] data_layout (Optional) Data layout of the input and weights tensor
- *
- * @return True if the optimized kernels can be executed else false
- */
- static bool is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier = 1, DataLayout data_layout = DataLayout::NCHW);
- /** Generates the convolver object */
- void generate_convolver();
-
+ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel
*
* @note Supported data layouts: NCHW and NHWC
@@ -96,40 +79,13 @@ public:
BorderSize border_size() const override;
private:
- void configure_generic();
- void configure_optimized();
-
- void run_generic(const Window &window, const ThreadInfo &info);
- void run_optimized(const Window &window, const ThreadInfo &info);
- /** Creates an optimized backend convolver object
- *
- * @note Convolver of strides 1,2 and convolution size of 3 is currently supported
- *
- * @param[in] conv_info Padding and stride information to use for the convolution
- * @param[in] w Weights tensor
- * @param[in] in Input tensor
- * @param[in] out Output tensor
- * @param[in] setup_strides (Optional) Boolean to enable setting the strides of the tensors
- * in the convolver in case of padding. Defaults to false
- *
- * @return A convolver object or nullptr if the configuration is not supported
- */
- std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver_object(PadStrideInfo conv_info,
- const ITensor *w,
- const ITensor *in,
- ITensor *out,
- bool setup_strides = false);
-
-private:
- BorderSize _border_size;
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_weights;
- PadStrideInfo _conv_info;
- std::unique_ptr<depthwise::IDepthwiseConvolution> _convolver;
- unsigned int _num_elems_written_per_iteration;
- bool _run_optimized;
- unsigned int _depth_multiplier;
+ BorderSize _border_size;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_weights;
+ PadStrideInfo _conv_info;
+ unsigned int _num_elems_written_per_iteration;
+ unsigned int _depth_multiplier;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */
diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
new file mode 100644
index 0000000000..def395ca1c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__
+#define __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** This class is a wrapper for the depthwise convolution assembly kernels. */
+class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
+{
+public:
+ const char *name() const override
+ {
+ return "NEDepthwiseConvolutionAssemblyKernelWrapper";
+ }
+
+ /** Default constructor */
+ NEDepthwiseConvolutionAssemblyKernelWrapper()
+ : _kernel(nullptr)
+ {
+ }
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
+ /** Default Move Constructor. */
+ NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
+ /** Default move assignment operator */
+ NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] kernel Pointer to an assembly kernel implementation.
+ */
+ void configure(depthwise::IDepthwiseConvolution *kernel)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
+ _kernel = kernel;
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
+ INEKernel::configure(win);
+ }
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ auto first = window.x().start();
+ auto last = window.x().end();
+ _kernel->run(first, last, info.thread_id);
+ }
+
+private:
+ depthwise::IDepthwiseConvolution *_kernel;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
new file mode 100644
index 0000000000..091b1652c9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace neon_convolution_kernels
+{
+
+enum class ActivationFunction
+{
+ None,
+ ReLU,
+ ReLU6,
+};
+
+}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
new file mode 100644
index 0000000000..33f77d7ee9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+// Utilities for copying tensor tiles and adding/removing padding.
+namespace padding
+{
+
+/* Copy a tile and apply padding to the output copy.
+ */
+template <typename T>
+void copy_and_pad_tile(
+ unsigned int tile_rows,
+ unsigned int tile_cols,
+ unsigned int n_channels,
+ const T *inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ T* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ unsigned int pad_bottom,
+ unsigned int pad_right,
+ T pad_value=static_cast<T>(0)
+);
+
+/** Copy a tile and remove padding elements in the output.
+ */
+template <unsigned int TileRows, unsigned int TileCols>
+class CopyCropped
+{
+ public:
+ static void execute(
+ size_t size, // Amount of data to copy
+ const void *inptr,
+ size_t in_row_stride,
+ size_t in_col_stride,
+ void *outptr,
+ size_t out_row_stride,
+ size_t out_col_stride,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ unsigned int pad_bottom,
+ unsigned int pad_right
+ );
+};
+
+}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
new file mode 100644
index 0000000000..6029cb67e3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include <cstdint>
+
+namespace qasymm8
+{
+
+struct QAsymm8Params
+{
+ uint8_t quantize(float value) const;
+ float dequantize(uint8_t value) const;
+
+ uint8_t offset;
+ float scale;
+};
+
+struct QAsymm8RescaleParams
+{
+ static QAsymm8RescaleParams make_rescale_params(
+ const QAsymm8Params& weight_quant,
+ const QAsymm8Params& input_quant,
+ const QAsymm8Params& output_quant
+ );
+
+ QAsymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
+
+ const int32_t shift, multiplier;
+ const float rescale;
+};
+
+}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
index 6567eeb23d..ad0a677a8f 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,6 +54,18 @@ struct Tensor4DShape
{
}
+ inline int index(const int n, const int i, const int j, const int c) const
+ {
+ if (this->ordering == NHWC)
+ {
+ return ((n*this->n_rows + i)*this->n_cols + j)*this->n_channels + c;
+ }
+ else // NCHW
+ {
+ return ((n*this->n_channels + c)*this->n_rows + i)*this->n_cols + j;
+ }
+ }
+
inline int size() const
{
return n_batches * n_rows * n_cols * n_channels;
@@ -94,6 +106,18 @@ struct KernelShape
{
}
+ inline int index(int oc, int i, int j, int ic) const
+ {
+ if (this->ordering == HWIO)
+ {
+ return ((i*this->n_cols + j)*this->n_input_channels + ic)*this->n_output_channels + oc;
+ }
+ else // OIHW
+ {
+ return ((oc*this->n_input_channels + ic)*this->n_rows + i)*this->n_cols + j;
+ }
+ }
+
inline int size(void) const
{
return n_output_channels * n_rows * n_cols * n_input_channels;
@@ -127,7 +151,16 @@ class Tensor4D final
return shape.size() * sizeof(T);
}
- inline T& element(int, int, int, int) const;
+ /* Extract an element of the tensor.
+ *
+ * If the shape is a Tensor4DShape then the index is given as batch, row,
+ * column and channel. If the shape is a KernelShape then the index is
+ * given as output channel, row, column and input channel.
+ */
+ inline T& element(const int a, const int b, const int c, const int d) const
+ {
+ return _data[shape.index(a, b, c, d)];
+ }
inline void Clear() {
Fill(static_cast<T>(0));
@@ -143,35 +176,3 @@ class Tensor4D final
private:
T* const _data;
};
-
-
-template <>
-inline float& Tensor4D<Tensor4DShape, float>::element(int n, int i, int j, int c) const
-{
- int index;
- if (shape.ordering == NHWC)
- {
- index = ((n*shape.n_rows + i)*shape.n_cols + j)*shape.n_channels + c;
- }
- else // NCHW
- {
- index = ((n*shape.n_channels + c)*shape.n_rows + i)*shape.n_cols + j;
- }
- return _data[index];
-}
-
-
-template <>
-inline float& Tensor4D<KernelShape, float>::element(int oc, int i, int j, int ic) const
-{
- int index;
- if (shape.ordering == HWIO)
- {
- index = ((i*shape.n_cols + j)*shape.n_input_channels + ic)*shape.n_output_channels + oc;
- }
- else // OIHW
- {
- index = ((oc*shape.n_input_channels + ic)*shape.n_rows + i)*shape.n_cols + j;
- }
- return _data[index];
-}
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index 6d9cb18f44..45e8da0272 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -24,42 +24,84 @@
#pragma once
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+
namespace depthwise
{
+namespace nck = neon_convolution_kernels;
+
class IDepthwiseConvolution
{
public:
virtual ~IDepthwiseConvolution() = default;
- virtual int output_size(const int dim_size, const bool padding_same) const = 0;
+
virtual int output_size(
int dim_size,
unsigned int padding_before,
unsigned int padding_after
) const = 0;
+ /* Set input tensor and stride. */
+ virtual void set_input(const void *inptr) = 0;
+ virtual void set_input(const void *inptr, int column_stride) = 0;
+ virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
+ virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+ /* Set output tensor and stride. */
+ virtual void set_output(void *outptr) = 0;
+ virtual void set_output(void *outptr, int column_stride) = 0;
+ virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
+ virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+ /* Weights and biases are re-ordered to improve memory access patterns. Use
+ * these methods to determine the size of the re-pack buffer and to set the
+ * address (and implicitly reorder the weights and biases into) the buffer.
+ */
+ virtual size_t get_packed_params_size(void) const = 0;
+ virtual void set_packed_params_buffer(void *) = 0;
+
+ virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
+ virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
+ virtual void pack_params(
+ void *buffer,
+ const void* weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const = 0;
+
+ /* Working space is used to pad tensors on the fly. Before running any
+ * inference check the amount of space required, allocate and provide a
+ * pointer to the convolution engine.
+ */
+ virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
+ virtual void set_working_space(void *) = 0;
+
virtual unsigned int get_window(void) const = 0;
- virtual void set_offsets(int input_offset, int weights_offset) = 0;
- virtual void run(const unsigned int start, const unsigned int stop) = 0;
+ virtual void run(
+ unsigned int start,
+ unsigned int stop,
+ unsigned int threadid=0
+ ) = 0;
};
template <
- int OutputTileRows,
- int OutputTileCols,
- int KernelRows,
- int KernelCols,
- int StrideRows,
- int StrideCols,
- typename TIn,
- typename TOut
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename TIn, typename TBias, typename TOut,
+ typename Derived
>
-class DepthwiseConvolution : public IDepthwiseConvolution
+class DepthwiseConvolutionBase : public IDepthwiseConvolution
{
public:
- typedef TIn InputType;
- typedef TOut OutputType;
-
// Information about the specific convolution instance
+ using InputType = TIn;
+ using BiasType = TBias;
+ using OutputType = TOut;
static constexpr int output_tile_rows = OutputTileRows;
static constexpr int output_tile_cols = OutputTileCols;
static constexpr int kernel_rows = KernelRows;
@@ -71,260 +113,84 @@ class DepthwiseConvolution : public IDepthwiseConvolution
/** Create a new depthwise convolution engine.
*
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_same True if padding is SAME, else VALID.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels, bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
- weights, input, output, 0 /* column stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_top Padding to apply to top of input.
- * @param[in] padding_left Padding to apply to left of input.
- * @param[in] padding_bottom Padding to apply to bottom of input.
- * @param[in] padding_right Padding to apply to right of input.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- padding_top, padding_left, padding_bottom, padding_right,
- weights, input, output, 0 /* column stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine with a specified column stride.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_same True if padding is SAME, else VALID.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] col_stride Stride between columns of the weights, inputs and output tensors.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels, bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- const int col_stride
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
- weights, input, output,
- col_stride, 0, /* Weight row stride = default */
- col_stride, 0, 0, /* Input row stride, batch stride = default */
- col_stride, 0, 0 /* Output row stride, batch stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine with a specified column stride.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_top Padding to apply to top of input.
- * @param[in] padding_left Padding to apply to left of input.
- * @param[in] padding_bottom Padding to apply to bottom of input.
- * @param[in] padding_right Padding to apply to right of input.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] col_stride Stride between columns of the weights, inputs and output tensors.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- const int col_stride
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- padding_top, padding_left, padding_bottom, padding_right,
- weights, input, output,
- col_stride, 0, /* Weight row stride = default */
- col_stride, 0, 0, /* Input row stride, batch stride = default */
- col_stride, 0, 0 /* Output row stride, batch stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_same True if padding is SAME, else VALID.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
- * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
- * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
- * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
- * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
- * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
- * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
- * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels, bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- int weight_col_stride,
- int weight_row_stride,
- int input_col_stride,
- int input_row_stride,
- int input_batch_stride,
- int output_col_stride,
- int output_row_stride,
- int output_batch_stride
- );
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_top Padding to apply to top of input.
- * @param[in] padding_left Padding to apply to left of input.
- * @param[in] padding_bottom Padding to apply to bottom of input.
- * @param[in] padding_right Padding to apply to right of input.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
- * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
- * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
- * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
- * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
- * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
- * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
- * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
+ * @param[in] n_batches Number of batches tensors.
+ * @param[in] n_input_rows Number of rows in input tensor.
+ * @param[in] n_input_cols Number of columns in input tensor.
+ * @param[in] n_channels Number of channels in input and output tensors.
*/
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels,
+ DepthwiseConvolutionBase(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
unsigned int padding_top,
unsigned int padding_left,
unsigned int padding_bottom,
- unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- int weight_col_stride,
- int weight_row_stride,
- int input_col_stride,
- int input_row_stride,
- int input_batch_stride,
- int output_col_stride,
- int output_row_stride,
- int output_batch_stride
+ unsigned int padding_right
);
// Cannot copy or move a DepthwiseConvolution.
- DepthwiseConvolution(DepthwiseConvolution&) = delete;
- DepthwiseConvolution operator=(DepthwiseConvolution&) = delete;
+ DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
+ DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
+
+ /* Set input tensor and stride. */
+ void set_input(const void *inptr) override;
+ void set_input(const void *inptr, int column_stride) override;
+ void set_input(const void *inptr, int row_stride, int column_stride) override;
+ void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+ /* Set output tensor and stride. */
+ void set_output(void *outptr) override;
+ void set_output(void *outptr, int column_stride) override;
+ void set_output(void *outptr, int row_stride, int column_stride) override;
+ void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
/** Get the number of output rows/columns.
*
* @param[in] dim_size Number of elements in the dimension (rows/columns)
* @param[in] same_padding True if the padding is SAME, otherwise false.
*/
- static int get_output_size(int dim_size, bool padding_same);
static int get_output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after
+ int dim_size, unsigned int padding_before, unsigned int padding_after
);
- /** Get the number of output rows/columns.
- *
- * @param[in] dim_size Number of elements in the dimension (rows/columns)
- * @param[in] same_padding True if the padding is SAME, otherwise false.
+ int output_size(
+ int dim_size, unsigned int padding_before, unsigned int padding_after
+ ) const override;
+
+ /* Determine how much memory is required to store the packed weights and
+ * biases.
*/
- int output_size(int dim_size, bool padding_same) const override
- {
- return DepthwiseConvolution<
- OutputTileRows,
- OutputTileCols,
- KernelRows,
- KernelCols,
- StrideRows,
- StrideCols,
- TIn, TOut
- >::get_output_size(dim_size, padding_same);
- }
+ size_t get_packed_params_size(void) const override;
- int output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after
- ) const override
- {
- return DepthwiseConvolution<
- OutputTileRows,
- OutputTileCols,
- KernelRows,
- KernelCols,
- StrideRows,
- StrideCols,
- TIn, TOut
- >::get_output_size(dim_size, padding_before, padding_after);
- }
-
- /** Sets quantization offsets
- *
- * @param[in] input_offset Input offset
- * @param[in] weights_offset Weights offset
+ /* Set the buffer for the packed weights and biases, and perform the
+ * packing.
+ */
+ void set_packed_params_buffer(void *buffer) override;
+
+ void pack_params(const void *weights, const void *biases=nullptr) const override;
+
+ void pack_params(
+ void *buffer,
+ const void *weights,
+ const void *biases=nullptr
+ ) const override;
+
+ void pack_params(
+ void *buffer,
+ const void *weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const override;
+
+ /** Query the amount of working space required.
+     * @param[in] n_threads The largest number of threads which will be used
+     *                      to execute the kernel.
+ */
+ size_t get_working_space_size(unsigned int n_threads=1) const override;
+
+ /** Set the working space buffer.
*/
- void set_offsets(int input_offset, int weights_offset) override;
+ void set_working_space(void *buffer) override;
/** Get the window of work to be performed by an instance of the operator.
*/
@@ -336,122 +202,282 @@ class DepthwiseConvolution : public IDepthwiseConvolution
*
* @param[in] start Start of the window of work to perform.
* @param[in] stop End of the work to perform.
+     * @param[in] threadid ID of the thread performing the work.
*/
- void run(unsigned int start, unsigned int stop) override;
+ void run(
+ unsigned int start,
+ unsigned int stop,
+ unsigned int threadid=0
+ ) override;
protected:
+ /** Get the value to use to pad the tensor.
+ */
+ TIn _input_padding_value(void) const;
+
+ /** Implementation of the parameter packing.
+ */
+ void _pack_params(
+ void *buffer,
+ const void *weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const;
+
/** Process a tile-row of the tensors.
*/
- static void process_tile_row(
+ void process_tile_row(
+ unsigned int threadid,
int n_channels,
- const TIn* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const TIn* const inptr,
- int in_row_stride,
- int in_col_stride,
- TOut* const outptr,
- int out_row_stride,
- int out_col_stride,
+ const void* packed_params,
+ const InputType* inptr,
+ OutputType* outptr,
int row_pad_in_top,
int row_pad_in_left,
int row_pad_in_bottom,
int row_pad_out_bottom,
int n_tiles,
int n_input_cols,
- int n_output_cols,
- int input_offset,
- int weights_offset
+ int n_output_cols
);
- // Determine the maximum (and minimum) padding values which can be applied
- // to tiles of the tensors involved in this class of convolution.
- static constexpr int max_in_pad_top = (kernel_rows - 1) / 2;
- static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2;
-
- static constexpr int max_in_pad_left = (kernel_cols - 1) / 2;
- static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2;
-
- static constexpr int max_in_pad_bottom = inner_tile_rows;
- static constexpr int max_in_pad_right = inner_tile_cols;
- static constexpr int max_out_pad_bottom = output_tile_rows;
- static constexpr int max_out_pad_right = output_tile_cols;
-
- static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1;
- static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1;
- static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1;
- static constexpr int n_in_pad_right_fns = max_in_pad_right + 1;
- static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1;
- static constexpr int n_out_pad_right_fns = max_out_pad_right + 1;
-
- /** Pointer to a function which will process a tile.
+ /** Process a single tile of the tensor.
*
- * @param[in] n_channels Number of channels.
- * @param[in] weights Pointer to Height x Width x Channels ordered weights.
- * @param[in] inptr Pointer to the top-left unpadded value of the tile.
- * @param[in] in_row_stride Stride between rows of the input tensor.
- * @param[in] in_col_stride Stride between columns of the input tensor.
- * @param[out] outptr Pointer to the top-left output value for the tile.
- * @param[in] out_row_stride Stride between rows of the output tensor.
- * @param[in] out_col_stride Stride between columns of the output tensor.
- *
- * The following parameters may be ignored if the function has been
- * specialised for specific padding constraints.
- *
- * @param[in] _in_pad_top Padding to apply to top of input tile.
- * @param[in] _in_pad_left Padding to apply to left of input tile.
- * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
- * @param[in] _in_pad_right Padding to apply to right of input tile.
- * @param[in] _out_pad_bottom Null cells at bottom of output tile.
- * @param[in] _out_pad_right Null cells at right of output tile.
+ * This method will apply input/output padding (if required) and call the
+ * depthwise tile implementation.
*/
- typedef void (*TileFn)(
+ void process_tile(
+ unsigned int threadid,
int n_channels,
- const TIn* const weights,
- int weight_row_stride,
- int weight_col_stride,
- const TIn* const inptr,
- int in_row_stride,
- int in_col_stride,
- TOut* const outptr,
- int out_row_stride,
- int out_col_stride,
- int _in_pad_top,
- int _in_pad_left,
- int _in_pad_bottom,
- int _in_pad_right,
- int _out_pad_bottom,
- int _out_pad_right,
- int _input_offset,
- int _weights_offset
+ const void* packed_params,
+ const InputType* inptr,
+ OutputType* outptr,
+ int pad_in_top,
+ int pad_in_left,
+ int pad_in_bottom,
+ int pad_in_right,
+ int pad_out_bottom,
+ int pad_out_right
);
- /* Arrays of methods to process tensor tiles.
- *
- * Allows dynamic dispatch to specialized implementations based on
- * different padding configurations.
+ /** Perform depthwise convolution on a single tile.
*/
- static const TileFn tilefn_unpadded;
- static const TileFn tilefn_top[n_in_pad_top_fns];
- static const TileFn tilefn_left[n_in_pad_left_fns];
- static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns];
- static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns];
- static const TileFn tilefn_generic;
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const InputType* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ OutputType* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+
+ int n_channels(void) const;
private:
// Member variables of instances of a convolution engine.
- const TIn* const _weights;
- const TIn* const _input;
- TOut* const _output;
+ const InputType* _input;
+ OutputType* _output;
+ void* _packed_parameters;
+ void* _working_space; // Per-thread working space
const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
_n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
+ const nck::ActivationFunction _activation;
// Stride information for a convolution instance
- const int _weight_col_stride, _weight_row_stride;
- const int _input_col_stride, _input_row_stride, _input_batch_stride;
- const int _output_col_stride, _output_row_stride, _output_batch_stride;
- int _input_offset, _weights_offset;
+ int _input_col_stride, _input_row_stride, _input_batch_stride;
+ const int _input_ws_col_stride, _input_ws_row_stride;
+ int _output_col_stride, _output_row_stride, _output_batch_stride;
+ const int _output_ws_col_stride, _output_ws_row_stride;
+
+ // Methods for getting access to working space
+ size_t _get_input_working_space_size(void) const;
+ size_t _get_output_working_space_size(void) const;
+
+ void *_get_input_working_space(unsigned int threadid) const;
+ void *_get_output_working_space(unsigned int threadid) const;
};
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename TIn, typename TBias, typename TOut
+>
+class DepthwiseConvolution : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut
+ >
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut
+ > >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ using Base::DepthwiseConvolutionBase;
+
+ protected:
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const TIn* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ TOut* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+};
+
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float
+> : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float
+ >
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float
+ > >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ protected:
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const float* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ float* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+> : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+ >
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+ > >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ protected:
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const float16_t* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ float16_t* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+};
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
new file mode 100644
index 0000000000..4c1d883a70
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+
+namespace depthwise
+{
+
+namespace nck = neon_convolution_kernels;
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ uint8_t, int32_t, uint8_t,
+ QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ uint8_t, int32_t, uint8_t,
+ QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+ >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params& weight_quantisation,
+ const qasymm8::QAsymm8Params& input_quantisation,
+ const qasymm8::QAsymm8Params& output_quantisation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params& weight_quantisation,
+ const qasymm8::QAsymm8Params& input_quantisation,
+ const qasymm8::QAsymm8Params& output_quantisation,
+ const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ protected:
+ static nck::ActivationFunction get_activation_fn(
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params& output_quantisation
+ );
+
+ uint8_t _input_padding_value(void) const;
+
+ void _pack_params(
+ void *buffer,
+ const void *weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const;
+
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ uint8_t* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+
+ private:
+ // Quantization parameters
+ const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
+ const qasymm8::QAsymm8RescaleParams rescale_parameters;
+};
+
+} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index b33f2768ad..674fc4d2df 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -31,101 +31,73 @@
*/
#include <algorithm>
+#include <cstdint>
#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
#pragma once
-namespace depthwise
-{
+#define MEMBERFN(TOUT) template <\
+ unsigned int OutputTileRows, unsigned int OutputTileColumns,\
+ unsigned int KernelRows, unsigned int KernelColumns,\
+ unsigned int StrideRows, unsigned int StrideColumns,\
+ typename TIn, typename TBias, typename TOut,\
+ typename Derived\
+> TOUT DepthwiseConvolutionBase<\
+ OutputTileRows, OutputTileColumns,\
+ KernelRows, KernelColumns,\
+ StrideRows, StrideColumns,\
+ TIn, TBias, TOut, Derived\
+>
-const unsigned int CHANNEL_BLOCK = 16;
+using namespace neon_convolution_kernels;
-namespace
+namespace depthwise
{
- inline int pad_along_dim(
- const bool padding_same,
- const int kernel_dim,
- const int stride_dim,
- const int input_dim
- )
- {
- if (!padding_same)
- return 0;
- if (input_dim % stride_dim)
- return std::max(kernel_dim - (input_dim % stride_dim), 0);
- else
- return std::max(kernel_dim - stride_dim, 0);
- }
-} // namespace
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
- const int dim_size, const bool same_padding
-)
+template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
+struct PackParameters
{
- return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR);
-}
+ static void execute(
+ unsigned int n_channels,
+ void *buffer,
+ const void *weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases
+ );
+};
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
+const unsigned int CHANNEL_BLOCK = 16;
+
+MEMBERFN(int)::get_output_size(
const int dim_size, const unsigned int padding_before, const unsigned int padding_after
)
{
- return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR);
+ return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
}
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
- const int n_batches, const int n_input_rows, const int n_input_cols,
- const int n_channels, const bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- const int weight_col_stride,
- const int weight_row_stride,
- const int input_col_stride,
- const int input_row_stride,
- const int input_batch_stride,
- const int output_col_stride,
- const int output_row_stride,
- const int output_batch_stride
-) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>(
- n_batches, n_input_rows, n_input_cols,
- n_channels,
- pad_along_dim(padding_same, KR, SR, n_input_rows) / 2, /* top padding */
- pad_along_dim(padding_same, KC, SC, n_input_cols) / 2, /* left padding */
- iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2), /* bottom padding */
- iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2), /* right padding */
- weights, input, output,
- weight_col_stride, weight_row_stride,
- input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride
-)
+MEMBERFN(int)::output_size(
+ const int dim_size, const unsigned int padding_before, const unsigned int padding_after
+) const
{
+ return get_output_size(dim_size, padding_before, padding_after);
}
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
- const int n_batches, const int n_input_rows, const int n_input_cols,
+MEMBERFN()::DepthwiseConvolutionBase(
+ const int n_batches,
+ const int n_input_rows,
+ const int n_input_cols,
const int n_channels,
+ ActivationFunction activation,
const unsigned int padding_top,
const unsigned int padding_left,
const unsigned int padding_bottom,
- const unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- const int weight_col_stride,
- const int weight_row_stride,
- const int input_col_stride,
- const int input_row_stride,
- const int input_batch_stride,
- const int output_col_stride,
- const int output_row_stride,
- const int output_batch_stride
-) : _weights(weights), _input(input), _output(output),
+ const unsigned int padding_right
+) : _input(nullptr), _output(nullptr),
+ _packed_parameters(nullptr),
+ _working_space(nullptr),
_n_batches(n_batches),
_n_input_rows(n_input_rows),
_n_input_cols(n_input_cols),
@@ -138,37 +110,157 @@ DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
_padding_left(padding_left),
_padding_bottom(padding_bottom),
_padding_right(padding_right),
- _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels),
- _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride),
- _input_col_stride(input_col_stride ? input_col_stride : _n_channels),
- _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride),
- _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
- _output_col_stride(output_col_stride ? output_col_stride : _n_channels),
- _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
- _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride),
- _input_offset(0), _weights_offset(0)
+ _activation(activation),
+ _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
+ _input_ws_col_stride(_n_channels),
+ _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols),
+ _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0),
+ _output_ws_col_stride(_n_channels),
+ _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns)
{
}
+MEMBERFN(void)::set_input(const void* const inptr)
+{
+ set_input(inptr, _n_channels);
+}
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
{
- // Parallelise over blocks of channels.
- return iceildiv(_n_channels, CHANNEL_BLOCK);
+ set_input(inptr, _n_input_cols * ld_col, ld_col);
+}
+
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
+{
+ set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
+}
+
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
+{
+ _input = static_cast<const TIn *>(inptr);
+ _input_batch_stride = ld_batch;
+ _input_row_stride = ld_row;
+ _input_col_stride = ld_col;
+}
+
+MEMBERFN(void)::set_output(void* const outptr)
+{
+ set_output(outptr, _n_channels);
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
+{
+ set_output(outptr, _n_output_cols * ld_col, ld_col);
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
+{
+ set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
+{
+ _output = static_cast<TOut *>(outptr);
+ _output_batch_stride = ld_batch;
+ _output_row_stride = ld_row;
+ _output_col_stride = ld_col;
+}
+
+MEMBERFN(size_t)::get_packed_params_size(void) const
+{
+ return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
+}
+
+MEMBERFN(void)::set_packed_params_buffer(void *buffer)
+{
+ _packed_parameters = buffer;
+}
+
+MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
+{
+ static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
+}
+
+MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
+{
+ const unsigned int weight_col_stride = _n_channels;
+ const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
+ static_cast<const Derived *>(this)->pack_params(
+ buffer, weights, weight_row_stride, weight_col_stride, biases
+ );
+}
+
+MEMBERFN(void)::pack_params(
+ void * const buffer,
+ const void * const weights,
+ const unsigned int weight_row_stride,
+ const unsigned int weight_col_stride,
+ const void * const biases
+) const
+{
+ static_cast<const Derived *>(this)->_pack_params(
+ buffer, weights, weight_row_stride, weight_col_stride, biases
+ );
+}
+
+MEMBERFN(void)::_pack_params(
+ void * const buffer,
+ const void * const weights,
+ const unsigned int weight_row_stride,
+ const unsigned int weight_col_stride,
+ const void * const biases
+) const
+{
+ // Default implementation
+ PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute(
+ _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
+ );
+}
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+ return nthreads * (
+ _get_input_working_space_size() + _get_output_working_space_size()
+ );
}
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
+MEMBERFN(void)::set_working_space(void *buffer)
{
- _input_offset = input_offset;
- _weights_offset = weights_offset;
+ _working_space = buffer;
}
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
+MEMBERFN(size_t)::_get_input_working_space_size(void) const
+{
+ return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels;
+}
+
+MEMBERFN(size_t)::_get_output_working_space_size(void) const
+{
+ return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels;
+}
+
+MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
+{
+ return static_cast<uint8_t*>(_working_space) + threadid * (
+ _get_input_working_space_size() + _get_output_working_space_size()
+ );
+}
+
+MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
+{
+ return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
+}
+
+MEMBERFN(unsigned int)::get_window() const
+{
+ // Parallelise over blocks of channels.
+ return iceildiv(_n_channels, CHANNEL_BLOCK);
+}
+
+MEMBERFN(void)::run(
const unsigned int start,
- const unsigned int stop
+ const unsigned int stop,
+ const unsigned int threadid
)
{
// Parallelise over blocks of channels
@@ -205,43 +297,38 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
const int output_row_bottom = (tile_i + 1)*output_tile_rows;
const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
+ // Get the offset into the packed parameters
+ const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
+ start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
+
// Process the row
process_tile_row(
+ threadid,
stop_channel - start_channel,
- _weights + start_channel, _weight_row_stride, _weight_col_stride,
- inptr_row + start_channel, _input_row_stride, _input_col_stride,
- outptr_row + start_channel, _output_row_stride, _output_col_stride,
+ params_ptr,
+ inptr_row + start_channel,
+ outptr_row + start_channel,
input_row_pad_top, input_pad_left, input_row_pad_bottom,
output_row_pad_bottom,
- _n_tile_cols, _n_input_cols, _n_output_cols,
- _input_offset, _weights_offset
+ _n_tile_cols, _n_input_cols, _n_output_cols
);
}
}
}
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
+MEMBERFN(void)::process_tile_row(
+ const unsigned int threadid,
const int n_channels,
- const TIn* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
+ const void* const packed_params,
const TIn* const inptr,
- const int in_row_stride,
- const int in_col_stride,
TOut* const outptr,
- const int out_row_stride,
- const int out_col_stride,
const int row_pad_in_top,
const int row_pad_in_left,
const int row_pad_in_bottom,
const int row_pad_out_bottom,
const int n_tiles,
const int n_input_cols,
- const int n_output_cols,
- const int input_offset,
- const int weights_offset
+ const int n_output_cols
)
{
constexpr int tile_overlap = kernel_cols - stride_cols;
@@ -261,264 +348,97 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row
// Get pointers into the inputs and outputs
const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
- const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride);
- TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
-
- // Apply the specific tile processing function
- const bool pad_top = row_pad_in_top > 0;
- const bool pad_left = t_pad_in_left > 0;
- const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom;
- const bool pad_right = t_pad_in_right || t_pad_out_right;
-
- const TileFn tilefn = [&] () {
- if (!pad_top && !pad_left && !pad_bottom && !pad_right)
- {
- // No padding
- return tilefn_unpadded;
- }
- else if (pad_top && !pad_left && !pad_bottom && !pad_right)
- {
- // Padding on the top only, subtract off the minimum expected padding in
- // order to index into the array of specialised methods.
- const int index = row_pad_in_top - min_in_pad_top;
- return tilefn_top[index];
- }
- else if (!pad_top && pad_left && !pad_bottom && !pad_right)
- {
- // Padding on the left only, subtract off the minimum expected padding in
- // order to index into the array of specialised methods.
- const int index = t_pad_in_left - min_in_pad_left;
- return tilefn_left[index];
- }
- else if (!pad_top && !pad_left && pad_bottom && !pad_right)
- {
- // Padding on the bottom only
- return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
- }
- else if (!pad_top && !pad_left && !pad_bottom && pad_right)
- {
- // Padding on the right only
- return tilefn_right[t_pad_in_right][t_pad_out_right];
- }
- else
- {
- // Otherwise use generic tile processing method.
- return tilefn_generic;
- }
- }();
-
- tilefn(
- n_channels,
- weights, weight_row_stride, weight_col_stride,
- inptr_col, in_row_stride, in_col_stride,
- outptr_col, out_row_stride, out_col_stride,
- row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
- row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
+ const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
+ TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
+
+ // Process just this tile
+ process_tile(
+ threadid, n_channels, packed_params, inptr_col, outptr_col,
+ row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, // Input paddings
+ row_pad_out_bottom, t_pad_out_right // Output paddings
);
}
}
-
-// New templated struct used solely as a way to provide tile processing
-// specialisations.
-template <int OutputTileRows, int OutputTileCols,
- int KernelRows, int KernelCols,
- int StrideRows, int StrideCols,
- typename TIn, typename TOut>
-struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols, TIn, TOut
->
+MEMBERFN(TIn)::_input_padding_value(void) const
{
- typedef DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TOut
- > DWC;
-
- /** Perform the depthwise convolution of a tile.
- *
- * @param[in] n_channels Number of channels.
- * @param[in] weights Pointer to Height x Width x Channels ordered weights.
- * @param[in] inptr Pointer to the top-left unpadded value of the tile.
- * @param[in] in_row_stride Stride between rows of the input tensor.
- * @param[in] in_col_stride Stride between columns of the input tensor.
- * @param[out] outptr Pointer to the top-left output value for the tile.
- * @param[in] out_row_stride Stride between rows of the output tensor.
- * @param[in] out_col_stride Stride between columns of the output tensor.
- *
- * The following parameters may be ignored if the function has been
- * specialised for specific padding constraints.
- *
- * @param[in] _in_pad_top Padding to apply to top of input tile.
- * @param[in] _in_pad_left Padding to apply to left of input tile.
- * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
- * @param[in] _in_pad_right Padding to apply to right of input tile.
- * @param[in] _out_pad_bottom Null cells at bottom of output tile.
- * @param[in] _out_pad_right Null cells at right of output tile.
- */
- template <
- bool Specialize=false, // Specialize (or not) the method
- int InPadTop=0, // If specialized, top padding
- int InPadLeft=0, // If specialized, left padding
- int InPadBottom=0, // If specialized, bottom padding
- int InPadRight=0, // If specialized, right padding
- int OutPadBottom=0, // If specialized, bottom output padding
- int OutPadRight=0 // If specialized, bottom right padding
- >
- static void process_tile(
- const int n_channels,
- const TIn* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const TIn* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- TOut* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int in_pad_top=0,
- const int in_pad_left=0,
- const int in_pad_bottom=0,
- const int in_pad_right=0,
- const int out_pad_bottom=0,
- const int out_pad_right=0,
- const int input_offset=0,
- const int weights_offset=0
- );
-};
-
+ return static_cast<TIn>(0);
+}
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-template <
- bool Specialize,
- int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
- int OutPadBottom, int OutPadRight
->
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile(
+MEMBERFN(void)::process_tile(
+ const unsigned int threadid,
const int n_channels,
- const TIn *__restrict__ const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const TIn *__restrict__ const inptr,
- const int in_row_stride,
- const int in_col_stride,
- TOut *__restrict__ const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int _in_pad_top,
- const int _in_pad_left,
- const int _in_pad_bottom,
- const int _in_pad_right,
- const int _out_pad_bottom,
- const int _out_pad_right,
- const int _input_offset,
- const int _weights_offset
+ const void* const packed_params,
+ const TIn* const inptr,
+ TOut* const outptr,
+ const int pad_in_top,
+ const int pad_in_left,
+ const int pad_in_bottom,
+ const int pad_in_right,
+ const int pad_out_bottom,
+ const int pad_out_right
)
{
- constexpr auto inner_tile_rows = DWC::inner_tile_rows;
- constexpr auto inner_tile_cols = DWC::inner_tile_cols;
- constexpr auto kernel_rows = DWC::kernel_rows;
- constexpr auto kernel_cols = DWC::kernel_cols;
- constexpr auto output_tile_rows = DWC::output_tile_rows;
- constexpr auto output_tile_cols = DWC::output_tile_cols;
- constexpr auto stride_rows = DWC::stride_rows;
- constexpr auto stride_cols = DWC::stride_cols;
-
- // Extract parameters
- const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
- const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
- const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
- const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
- const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
- const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
- // Compute valid ranges of the tile
- const int in_cells_i = inner_tile_rows - in_pad_bottom;
- const int in_cells_j = inner_tile_cols - in_pad_right;
- const int out_cells_i = output_tile_rows - out_pad_bottom;
- const int out_cells_j = output_tile_cols - out_pad_right;
-
- // Instantiate pointers
- const TIn* __restrict__ inptr_base = inptr;
- const TIn* __restrict__ wptr_base = weights;
- TOut* __restrict__ outptr_base = outptr;
-
- // Perform the depthwise convolution
- int channels_remaining = n_channels;
- for (; channels_remaining; channels_remaining--)
+ const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
+ const bool pad_output = pad_out_bottom || pad_out_right;
+
+ if (pad_input)
{
- // Load input tile
- TIn u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
- {
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = static_cast<TIn>(0);
- }
- else
- {
- u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
- }
- }
- }
- inptr_base++;
+ // Copy the input into the temporary buffer, applying padding
+ padding::copy_and_pad_tile<TIn>(
+ inner_tile_rows, inner_tile_cols, n_channels,
+ inptr, _input_row_stride, _input_col_stride,
+ static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride,
+ pad_in_top, pad_in_left, pad_in_bottom, pad_in_right,
+ static_cast<Derived *>(this)->_input_padding_value()
+ );
+ }
- // Load weights tile
- TIn w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
- {
- const TIn* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
- {
- w[i][j] = *(wptr_row + j*weight_col_stride);
- }
- }
- wptr_base++;
+ // Execute the kernel
+ const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid));
+ const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride;
+ const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride;
- // Perform the convolution
- TOut v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
- {
- for (int out_j = 0; out_j < out_cells_j; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = static_cast<TOut>(0);
-
- // Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
-
- // Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
- {
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
- }
- }
+ TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid));
+ const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride;
+ const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride;
- // Store the output tile
- for (int i = 0; i < out_cells_i; i++)
- {
- TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- outptr_base++;
+ Derived * dthis = static_cast<Derived *>(this);
+
+ switch(_activation)
+ {
+ case ActivationFunction::ReLU:
+ dthis->template execute_tile<ActivationFunction::ReLU>(
+ n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+ );
+ break;
+ case ActivationFunction::ReLU6:
+ dthis->template execute_tile<ActivationFunction::ReLU6>(
+ n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+ );
+ break;
+ default:
+ dthis->template execute_tile<ActivationFunction::None>(
+ n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+ );
+ break;
}
+
+ if (pad_output)
+ {
+ // Copy the output from the temporary buffer, removing unnecessary values
+ padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute(
+ n_channels * sizeof(TOut),
+ _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut),
+ outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut),
+ 0, 0, pad_out_bottom, pad_out_right
+ );
+ }
+}
+
+MEMBERFN(int)::n_channels(void) const
+{
+ return _n_channels;
}
} // namespace depthwise
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
new file mode 100644
index 0000000000..704e178292
--- /dev/null
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MISC_INFO_HELPERS_H__
+#define __ARM_COMPUTE_MISC_INFO_HELPERS_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace utils
+{
+namespace info_helpers
+{
+/** Checks if activation information correspond to a relu activation function
+ *
+ * @param[in] activation_info Activation metadata
+ *
+ * @return True if activation metadata correspond to a relu activation else false
+ */
+inline bool is_relu(ActivationLayerInfo activation_info)
+{
+ return activation_info.enabled() && activation_info.activation() == ActivationLayerInfo::ActivationFunction::RELU;
+}
+
+/** Checks if activation information correspond to a relu6 activation function
+ *
+ * @param[in] activation_info Activation metadata
+ *
+ * @return True if activation metadata correspond to a relu6 activation else false
+ */
+inline bool is_relu6(ActivationLayerInfo activation_info)
+{
+ return activation_info.enabled()
+ && activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ && activation_info.a() == 6.f && activation_info.b() == 0.f;
+}
+} // namespace info_helpers
+} // namespace utils
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_INFO_HELPERS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index e2fe11ea7f..28f0560e93 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,6 +37,7 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
@@ -53,7 +54,15 @@ class NEDepthwiseConvolutionLayer3x3 : public IFunction
{
public:
/** Default constructor */
- NEDepthwiseConvolutionLayer3x3();
+ NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionLayer3x3(const NEDepthwiseConvolutionLayer3x3 &) = delete;
+ /** Default move constructor */
+ NEDepthwiseConvolutionLayer3x3(NEDepthwiseConvolutionLayer3x3 &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionLayer3x3 &operator=(const NEDepthwiseConvolutionLayer3x3 &) = delete;
+ /** Default move assignment operator */
+ NEDepthwiseConvolutionLayer3x3 &operator=(NEDepthwiseConvolutionLayer3x3 &&) = default;
/** Initialize the function's source, destination, kernels and border_size.
*
* @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
@@ -86,9 +95,44 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
+ /** Configure the kernels/functions for the generic pipeline.
+ *
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info Activation layer information in case of a fused activation.
+ */
+ void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+ /** Configure the kernels/functions for the optimized pipeline.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info Activation layer information in case of a fused activation.
+ */
+ void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info);
+ /** Run generic kernel */
+ void run_generic();
+ /** Run optimized function */
+ void run_optimized();
+
+private:
+ MemoryGroup _memory_group;
NEDepthwiseConvolutionLayer3x3Kernel _dwc_kernel;
+ NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func;
NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
NEFillBorderKernel _border_handler;
NEPermute _permute_input;
@@ -99,14 +143,14 @@ private:
Tensor _permuted_input;
Tensor _permuted_weights;
Tensor _permuted_output;
+ const ITensor *_original_weights;
bool _has_bias;
bool _is_quantized;
bool _is_optimized;
- bool _are_weights_reshaped;
bool _is_nchw;
- bool _is_first_run;
bool _permute;
bool _is_activationlayer_enabled;
+ bool _is_prepared;
};
/** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
new file mode 100644
index 0000000000..df8f29d2c7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__
+#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+
+namespace arm_compute
+{
+/** Depthwise convolution assembly kernel glue */
+class NEDepthwiseConvolutionAssemblyDispatch : public IFunction
+{
+public:
+ /** Default constructor
+ *
+ * @param[in,out] memory_manager Memory manager to use
+ */
+ NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionAssemblyDispatch(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
+ /** Default move constructor */
+ NEDepthwiseConvolutionAssemblyDispatch(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
+ /** Default move assignment operator */
+ NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @note Supports only NHWC format
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
+ *
+ * @note Supports only NHWC format
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ * @return An error status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Check if the optimized kernel can be used for the given kernel sizes and strides
+ *
+ * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
+ *
+ * @param[in] input Input tensor info.
+ * @param[in] weights Weights tensor info.
+ * @param[in] conv_info Convolution layer metadata.
+ * @param[in] depth_multiplier (Optional) Depth multiplier to be used.
+ *
+ * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
+ */
+ static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ const ITensor *_input;
+ const ITensor *_weights;
+ const ITensor *_bias;
+ ITensor *_output;
+ Tensor _packed_weights;
+ Tensor _workspace;
+ bool _is_prepared;
+ std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel;
+ NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__ */