author    Georgios Pinitas <georgios.pinitas@arm.com>    2019-06-24 14:56:34 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>    2019-07-09 09:31:37 +0000
commit    30271c779c36a2abe6995c4454674d92bbc1f91f
tree      531257ff87cf2cb8d6f3b8da0abe3e6cb77a2a0e
parent    30dbeef2f46bdd6fe05d25dfa27cb4b2359dced3
COMPMID-2156: Optimized dilated convolution for NEON.
Change-Id: I3a8abe8cc9637c8983d9bd69dcbaee1a15eac8d0
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1492
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
-rw-r--r--SConscript7
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp41
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp156
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp38
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp88
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp32
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp295
-rw-r--r--arm_compute/graph/backends/FunctionHelpers.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h119
-rw-r--r--arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h39
-rw-r--r--docs/00_introduction.dox1
-rw-r--r--src/core/NEON/kernels/convolution/common/padding.cpp4
-rw-r--r--src/core/NEON/kernels/convolution/common/qasymm8.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp32
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp142
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp29
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp28
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp997
-rw-r--r--src/graph/backends/CL/CLFunctionsFactory.cpp4
-rw-r--r--src/graph/backends/NEON/NEFunctionFactory.cpp4
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp327
-rw-r--r--src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp127
-rw-r--r--tests/datasets/DepthwiseConvolutionLayerDataset.h1
-rw-r--r--tests/validation/NEON/DepthwiseConvolutionLayer.cpp8
25 files changed, 2192 insertions(+), 333 deletions(-)
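
For context (not part of the patch): a minimal usage sketch of the new NEDepthwiseConvolutionLayerOptimized function added below, exercising the new dilation argument. Only the configure() signature is taken from the header in this change; the tensor objects, shapes, stride/padding values and the use of RELU are illustrative assumptions.

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_dilated_depthwise(Tensor &input, Tensor &weights, Tensor &biases, Tensor &output)
    {
        // 3x3 depthwise convolution, stride 1, padding 2, dilation 2 in x and y.
        NEDepthwiseConvolutionLayerOptimized dwc;
        dwc.configure(&input, &weights, &biases, &output,
                      PadStrideInfo(1, 1, 2, 2),
                      /* depth_multiplier */ 1,
                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                      Size2D(2U, 2U)); // dilation of 2 in x and y
        dwc.run();
    }
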
diff --git a/SConscript b/SConscript
index 45c4ccc414..a170a4a7c1 100644
--- a/SConscript
+++ b/SConscript
@@ -205,10 +205,13 @@ if env['neon']:
core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
- # build winograd sources for either v7a / v8a
+ # build winograd/depthwise sources for either v7a / v8a
core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp')
core_files += Glob('src/core/NEON/kernels/convolution/winograd/*/*.cpp')
- arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/winograd/","arm_compute/core/NEON/kernels/convolution/common/" , "arm_compute/core/NEON/kernels/assembly/"])
+ arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/common/",
+ "arm_compute/core/NEON/kernels/convolution/winograd/",
+ "arm_compute/core/NEON/kernels/convolution/depthwise/",
+ "arm_compute/core/NEON/kernels/assembly/"])
graph_files += Glob('src/graph/backends/NEON/*.cpp')
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index e0cb616a3d..a4a833d90a 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -25,8 +25,8 @@
#pragma once
#include <arm_neon.h>
-#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+#include "activation.hpp"
+#include "padding.hpp"
namespace depthwise
{
@@ -127,6 +127,23 @@ class DepthwiseConvolutionBase : public IDepthwiseConvolution
unsigned int padding_right
);
+ /** Create a new depthwise convolution engine with a user-specified output size.
+ *
+ * @param[in] n_batches Number of batches of the input and output tensors.
+ * @param[in] n_input_rows Number of rows in input tensor.
+ * @param[in] n_input_cols Number of columns in input tensor.
+ * @param[in] n_channels Number of channels in input and output tensors.
+ */
+ DepthwiseConvolutionBase(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
// Cannot copy or move a DepthwiseConvolution.
DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
@@ -417,6 +434,16 @@ class DepthwiseConvolution<
unsigned int padding_right
);
+ DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
protected:
template <nck::ActivationFunction Activation>
void execute_tile(
@@ -488,6 +515,16 @@ class DepthwiseConvolution<
unsigned int padding_right
);
+ DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
protected:
template <nck::ActivationFunction Activation>
void execute_tile(
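
For illustration only (not part of the patch): constructing an engine through the new explicit output-size overload might look as follows. The template arguments mirror one of the float configurations used elsewhere in the library; the tensor dimensions and the nck::ActivationFunction::None enumerator are assumptions made for the sketch.

    #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"

    using Conv = depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;

    // Input 28x28x64, one batch, 'same' padding, so the caller states a 28x28 output.
    Conv conv(/* n_batches */ 1, /* n_input_rows */ 28, /* n_input_cols */ 28,
              /* n_channels */ 64, /* n_output_rows */ 28, /* n_output_cols */ 28,
              nck::ActivationFunction::None,
              /* padding_top */ 1, /* padding_left */ 1,
              /* padding_bottom */ 1, /* padding_right */ 1);
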
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
new file mode 100644
index 0000000000..e0d7f0c7f1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <deque>
+#include <functional>
+#include <memory>
+
+#include "depthwise.hpp"
+
+namespace depthwise
+{
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename TIn, typename TBias, typename TOut
+>
+class DilatedDepthwiseConvolution : public IDepthwiseConvolution
+{
+ public:
+ /** Create a new dilated depthwise convolution engine.
+ */
+ DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ /** Create a new dilated depthwise convolution engine.
+ */
+ DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ // Cannot copy or move a DilatedDepthwiseConvolution.
+ DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
+ DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
+
+ /* Set input tensor and stride. */
+ void set_input(const void *inptr) override;
+ void set_input(const void *inptr, int column_stride) override;
+ void set_input(const void *inptr, int row_stride, int column_stride) override;
+ void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+ /* Set output tensor and stride. */
+ void set_output(void *outptr) override;
+ void set_output(void *outptr, int column_stride) override;
+ void set_output(void *outptr, int row_stride, int column_stride) override;
+ void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
+
+ static int get_output_size(
+ int dim_size,
+ unsigned int padding_before,
+ unsigned int padding_after,
+ int dilation_factor
+ );
+
+ int output_size(
+ int dim_size, unsigned int padding_before, unsigned int padding_after
+ ) const override;
+
+ /* Weights and biases are re-ordered to improve memory access patterns. Use
+ * these methods to determine the size of the re-pack buffer and to set the
+ * address of that buffer (packing implicitly re-orders the weights and biases
+ * into it).
+ */
+ size_t get_packed_params_size(void) const override;
+ void set_packed_params_buffer(void *) override;
+
+ void pack_params(const void *weights, const void *biases=nullptr) const override;
+ void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
+ void pack_params(
+ void *buffer,
+ const void* weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const override;
+
+ /* Working space is used to pad tensors on the fly. Before running any
+ * inference, check the amount of space required, allocate it, and pass a
+ * pointer to the convolution engine.
+ */
+ size_t get_working_space_size(unsigned int nthreads=1) const override;
+ void set_working_space(void *) override;
+
+ unsigned int get_window(void) const override;
+ void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
+
+ protected:
+ /** Protected constructor which also accepts a function to construct a new
+ * subconvolution
+ */
+ DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right,
+ std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
+ );
+
+ const int _dilation_factor;
+ const int _n_input_rows, _n_input_cols, _n_channels;
+ const int _padding_top, _padding_left;
+ const int _n_output_rows, _n_output_cols;
+
+ /* Dilated depthwise convolution is performed through repeated calls to
+ * non-dilated convolutions. If the dilation factor is $n$, then we perform
+ * $n^2$ depthwise convolutions.
+ */
+ using BaseDepthwise = DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut
+ >;
+ std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
+};
+
+} // namespace depthwise
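
A standalone sketch (not library code) of how the class above decomposes a dilated depthwise convolution into dilation_factor^2 non-dilated sub-convolutions. The per-sub-convolution sizes follow the iceildiv computation used in impl_dilated.hpp further down in this patch; the 56x56 dimensions are an assumed example.

    #include <cstdio>

    static int iceildiv(const int a, const int b) { return (a + b - 1) / b; }

    int main()
    {
        const int dilation_factor = 2;
        const int n_input_rows = 56, n_input_cols = 56;
        const int n_output_rows = 56, n_output_cols = 56; // 'same' output assumed

        for (int i = 0; i < dilation_factor; i++)
        {
            for (int j = 0; j < dilation_factor; j++)
            {
                // Sub-convolution (i, j) reads every dilation_factor-th row/column,
                // starting at offset (i, j), and writes the matching output stripes.
                std::printf("subconv (%d,%d): input %dx%d -> output %dx%d\n", i, j,
                            iceildiv(n_input_rows - i, dilation_factor),
                            iceildiv(n_input_cols - j, dilation_factor),
                            iceildiv(n_output_rows - i, dilation_factor),
                            iceildiv(n_output_cols - j, dilation_factor));
            }
        }
        return 0;
    }
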
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index e34023faf1..b65ced6f35 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#pragma once
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+#include "depthwise.hpp"
+#include "qasymm8.hpp"
namespace depthwise
{
@@ -70,6 +70,33 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
QAsymm8DepthwiseConvolution(
int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params& weight_quantisation,
+ const qasymm8::QAsymm8Params& input_quantisation,
+ const qasymm8::QAsymm8Params& output_quantisation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params& weight_quantisation,
+ const qasymm8::QAsymm8Params& input_quantisation,
+ const qasymm8::QAsymm8Params& output_quantisation,
+ const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
nck::ActivationFunction activation,
const qasymm8::QAsymm8Params& weight_quantisation,
const qasymm8::QAsymm8Params& input_quantisation,
@@ -82,6 +109,11 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
);
protected:
+ static nck::ActivationFunction get_activation_fn(
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params& output_quantisation
+ );
+
uint8_t _input_padding_value(void) const;
void _pack_params(
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
new file mode 100644
index 0000000000..cf1c6f581f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "depthwise_dilated.hpp"
+#include "depthwise_quantized.hpp"
+
+namespace depthwise {
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols>
+class QAsymm8DilatedDepthwiseConvolution
+ : public DilatedDepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
+ StrideCols, uint8_t, int32_t, uint8_t> {
+public:
+ /** Create a new dilated depthwise convolution engine.
+ */
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right);
+
+ /** Create a new dilated depthwise convolution engine.
+ */
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right);
+
+ /** Create a new dilated depthwise convolution engine.
+ */
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right);
+
+ /** Create a new dilated depthwise convolution engine.
+ */
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right);
+};
+
+} // namespace depthwise
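
A hedged construction sketch for the quantised engine declared above. The QAsymm8Params objects wq, iq and oq are assumed to be built elsewhere (their construction is not shown in this patch); the make_rescale_params() helper and the constructor argument order are taken from the sources added later in this change, while the shapes and padding values are illustrative.

    #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"

    // wq/iq/oq: weight, input and output quantisation parameters (assumed given).
    extern const qasymm8::QAsymm8Params wq, iq, oq;

    void build_quantised_dilated_conv()
    {
        const auto rescale =
            qasymm8::QAsymm8RescaleParams::make_rescale_params(wq, iq, oq);

        // 3x3 kernel, stride 1, dilation 2, 'same' padding of 2 on each side.
        depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1> conv(
            /* n_batches */ 1, /* n_input_rows */ 56, /* n_input_cols */ 56,
            /* n_channels */ 64, /* dilation_factor */ 2,
            nck::ActivationFunction::None, wq, iq, oq, rescale,
            /* padding_top */ 2, /* padding_left */ 2,
            /* padding_bottom */ 2, /* padding_right */ 2);
        // ...then conv.pack_params(...), conv.set_input(...), conv.set_output(...), conv.run(...).
    }
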
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index 493b2991dc..b102a24250 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -32,9 +32,9 @@
#include <algorithm>
#include <cstdint>
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
+#include "depthwise.hpp"
+#include "padding.hpp"
+#include "utils.hpp"
#pragma once
@@ -95,6 +95,28 @@ MEMBERFN()::DepthwiseConvolutionBase(
const unsigned int padding_left,
const unsigned int padding_bottom,
const unsigned int padding_right
+) : DepthwiseConvolutionBase(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ get_output_size(n_input_rows, padding_top, padding_bottom),
+ get_output_size(n_input_cols, padding_left, padding_right),
+ activation,
+ padding_top, padding_left, padding_bottom, padding_right
+ )
+{
+}
+
+MEMBERFN()::DepthwiseConvolutionBase(
+ const int n_batches,
+ const int n_input_rows,
+ const int n_input_cols,
+ const int n_channels,
+ const int n_output_rows,
+ const int n_output_cols,
+ ActivationFunction activation,
+ const unsigned int padding_top,
+ const unsigned int padding_left,
+ const unsigned int padding_bottom,
+ const unsigned int padding_right
) : _input(nullptr), _output(nullptr),
_packed_parameters(nullptr),
_working_space(nullptr),
@@ -102,8 +124,8 @@ MEMBERFN()::DepthwiseConvolutionBase(
_n_input_rows(n_input_rows),
_n_input_cols(n_input_cols),
_n_channels(n_channels),
- _n_output_rows(get_output_size(n_input_rows, padding_top, padding_bottom)),
- _n_output_cols(get_output_size(n_input_cols, padding_left, padding_right)),
+ _n_output_rows(n_output_rows),
+ _n_output_cols(n_output_cols),
_n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
_n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
_padding_top(padding_top),
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
new file mode 100644
index 0000000000..2ef965ba4b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_dilated.hpp"
+#include "utils.hpp"
+
+#define MEMBERFN(TOUT) \
+ template <unsigned int OutputTileRows, unsigned int OutputTileColumns, \
+ unsigned int KernelRows, unsigned int KernelColumns, \
+ unsigned int StrideRows, unsigned int StrideColumns, typename TIn, \
+ typename TBias, typename TOut> \
+ TOUT DilatedDepthwiseConvolution<OutputTileRows, OutputTileColumns, \
+ KernelRows, KernelColumns, StrideRows, \
+ StrideColumns, TIn, TBias, TOut>
+
+namespace depthwise {
+
+MEMBERFN()
+::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
+ const int n_input_cols, const int n_channels,
+ const int dilation_factor,
+ nck::ActivationFunction activation,
+ const unsigned int padding_top,
+ const unsigned int padding_left,
+ const unsigned int padding_bottom,
+ const unsigned int padding_right)
+ : DilatedDepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+ DilatedDepthwiseConvolution::get_output_size(
+ n_input_rows, padding_top, padding_bottom, dilation_factor),
+ DilatedDepthwiseConvolution::get_output_size(
+ n_input_cols, padding_left, padding_right, dilation_factor),
+ activation, padding_top, padding_left, padding_bottom,
+ padding_right) {}
+
+MEMBERFN()
+::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
+ const int n_input_cols, const int n_channels,
+ const int dilation_factor,
+ const int n_output_rows, const int n_output_cols,
+ nck::ActivationFunction activation,
+ const unsigned int padding_top,
+ const unsigned int padding_left,
+ const unsigned int, // padding_bottom
+ const unsigned int // padding_right
+ )
+ : DilatedDepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+ n_output_rows, n_output_cols, activation, padding_top, padding_left,
+ 0, 0,
+ // Function which creates a new (standard) depthwise convolution
+ [](const int n_batches, const int n_input_rows,
+ const int n_input_cols, const int n_channels,
+ const int n_output_rows, const int n_output_cols,
+ const nck::ActivationFunction activation,
+ const unsigned int padding_top, const unsigned int padding_left,
+ const unsigned int padding_bottom,
+ const unsigned int padding_right) -> IDepthwiseConvolution * {
+ return new DepthwiseConvolution<
+ OutputTileRows, OutputTileColumns, KernelRows, KernelColumns,
+ StrideRows, StrideColumns, TIn, TBias, TOut>(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ n_output_rows, n_output_cols, activation, padding_top,
+ padding_left, padding_bottom, padding_right);
+ }) {}
+
+MEMBERFN()
+::DilatedDepthwiseConvolution(
+ const int n_batches, const int n_input_rows, const int n_input_cols,
+ const int n_channels, const int dilation_factor, const int n_output_rows,
+ const int n_output_cols, nck::ActivationFunction activation,
+ const unsigned int padding_top, const unsigned int padding_left,
+ const unsigned int, // padding_bottom
+ const unsigned int, // padding_right
+ std::function<IDepthwiseConvolution *(
+ int, int, int, int, int, int, nck::ActivationFunction, unsigned int,
+ unsigned int, unsigned int, unsigned int)>
+ subconvfn // Function to create a new convolution
+ )
+ : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows),
+ _n_input_cols(n_input_cols), _n_channels(n_channels),
+ _padding_top(static_cast<int>(padding_top)),
+ _padding_left(static_cast<int>(padding_left)),
+ _n_output_rows(n_output_rows), _n_output_cols(n_output_cols),
+ _convs(_dilation_factor) {
+ // Instantiate the base convolutions
+ for (int i = 0; i < _dilation_factor; i++) {
+ // Compute properties of this row of base convolutions
+ const int row_top =
+ i * StrideRows - _padding_top; // -ve values are in the padding
+ const int row_pad_top =
+ row_top < 0 ? iceildiv(-row_top, dilation_factor) : 0;
+
+ const int _n_input_rows = iceildiv(n_input_rows - i, dilation_factor);
+ const int _n_output_rows = iceildiv(n_output_rows - i, dilation_factor);
+
+ for (int j = 0; j < _dilation_factor; j++) {
+ // Compute properties of the base convolution
+ const int col_left =
+ j * StrideColumns - padding_left; // -ve values are in the padding
+ const int col_pad_left =
+ col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0;
+
+ const int _n_input_cols = iceildiv(n_input_cols - j, dilation_factor);
+ const int _n_output_cols = iceildiv(n_output_cols - j, dilation_factor);
+
+ // Create new depthwise convolution engine and include it in the vector
+ // of engines. The new depthwise convolution engine is created by calling
+ // the delegate function we received as an argument.
+ _convs[i].emplace_back(subconvfn(
+ n_batches, _n_input_rows, _n_input_cols, n_channels, _n_output_rows,
+ _n_output_cols, activation,
+ // Note: since we have computed the output tensor size we don't need
+ // to explicitly provide bottom and right padding values to the
+ // depthwise convolution.
+ row_pad_top, col_pad_left, 0, 0));
+ }
+ }
+}
+
+MEMBERFN(void)::set_input(const void *const inptr) {
+ set_input(inptr, _n_channels);
+}
+
+MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) {
+ set_input(inptr, _n_input_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)
+::set_input(const void *const inptr, const int ldrow, const int ldcol) {
+ set_input(inptr, _n_input_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)
+::set_input(const void *const inptr, const int ldbatch, const int ldrow,
+ const int ldcol) {
+ // Compute dilated strides
+ const int ldrow_dilated = ldrow * _dilation_factor;
+ const int ldcol_dilated = ldcol * _dilation_factor;
+
+ // Pass input parameters on to base convolutions
+ for (int i = 0; i < _dilation_factor; i++) {
+ const int top_pos =
+ i * StrideRows - _padding_top +
+ ((static_cast<int>(i * StrideRows) < _padding_top)
+ ? iceildiv(_padding_top - i * StrideRows, _dilation_factor) *
+ _dilation_factor
+ : 0);
+ const TIn *const inptr_i =
+ static_cast<const TIn *>(inptr) + top_pos * ldrow;
+
+ for (int j = 0; j < _dilation_factor; j++) {
+ int left_pos = j * StrideColumns - _padding_left;
+ while (left_pos < 0)
+ left_pos += _dilation_factor;
+
+ // Modify the pointer to point to the first element of the dilated input
+ // tensor, then set the input for this convolution engine.
+ const void *const inptr_ij = inptr_i + left_pos * ldcol;
+ _convs[i][j]->set_input(inptr_ij, ldbatch, ldrow_dilated, ldcol_dilated);
+ }
+ }
+}
+
+MEMBERFN(void)::set_output(void *const outptr) {
+ set_output(outptr, _n_channels);
+}
+
+MEMBERFN(void)::set_output(void *const outptr, const int ldcol) {
+ set_output(outptr, _n_output_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)
+::set_output(void *const outptr, const int ldrow, const int ldcol) {
+ set_output(outptr, _n_output_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)
+::set_output(void *const outptr, const int ldbatch, const int ldrow,
+ const int ldcol) {
+ // Compute dilated strides
+ const int ldrow_dilated = ldrow * _dilation_factor;
+ const int ldcol_dilated = ldcol * _dilation_factor;
+
+ // Pass output parameters on to base convolutions
+ for (int i = 0; i < _dilation_factor; i++) {
+ for (int j = 0; j < _dilation_factor; j++) {
+ // Modify the pointer to point to the first element of the dilated output
+ // tensor, then set the output for this convolution engine.
+ void *const outptr_ij =
+ static_cast<TOut *>(outptr) + i * ldrow + j * ldcol;
+ _convs[i][j]->set_output(outptr_ij, ldbatch, ldrow_dilated,
+ ldcol_dilated);
+ }
+ }
+}
+
+MEMBERFN(int)
+::get_output_size(const int dim_size, const unsigned int padding_before,
+ const unsigned int padding_after, const int dilation_factor) {
+ const int input_size =
+ dim_size + static_cast<int>(padding_before + padding_after);
+ const int window_size = (KernelRows - 1) * dilation_factor + 1;
+ return iceildiv(input_size - window_size + 1, StrideRows);
+}
+
+MEMBERFN(int)
+::output_size(const int dim_size, const unsigned int padding_before,
+ const unsigned int padding_after) const {
+ return get_output_size(dim_size, padding_before, padding_after,
+ _dilation_factor);
+}
+
+MEMBERFN(size_t)::get_packed_params_size(void) const {
+ return _convs[0][0]->get_packed_params_size();
+}
+
+MEMBERFN(void)::set_packed_params_buffer(void *buffer) {
+ // Set the buffer for all convolution engines
+ for (auto &&row : _convs) {
+ for (auto &&conv : row) {
+ conv->set_packed_params_buffer(buffer);
+ }
+ }
+}
+
+MEMBERFN(void)
+::pack_params(const void *const weights, const void *const biases) const {
+ _convs[0][0]->pack_params(weights, biases);
+}
+
+MEMBERFN(void)
+::pack_params(void *const buffer, const void *const weights,
+ const void *const biases) const {
+ _convs[0][0]->pack_params(buffer, weights, biases);
+}
+
+MEMBERFN(void)
+::pack_params(void *const buffer, const void *const weights,
+ const unsigned int ldrow, const unsigned int ldcol,
+ const void *const biases) const {
+ _convs[0][0]->pack_params(buffer, weights, ldrow, ldcol, biases);
+}
+
+MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const {
+ return _convs[0][0]->get_working_space_size(nthreads);
+}
+
+MEMBERFN(void)::set_working_space(void *const ws) {
+ // Use the same working space set for all contained depthwise engines.
+ for (auto &&row : _convs) {
+ for (auto &&conv : row) {
+ conv->set_working_space(ws);
+ }
+ }
+}
+
+MEMBERFN(unsigned int)::get_window(void) const {
+ return _convs[0][0]->get_window();
+}
+
+MEMBERFN(void)
+::run(const unsigned int start, const unsigned int stop,
+ const unsigned int threadid) {
+ // Run each contained convolution in turn
+ for (auto &&row : _convs) {
+ for (auto &&conv : row) {
+ conv->run(start, stop, threadid);
+ }
+ }
+}
+
+} // namespace depthwise
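
A worked restatement (illustrative, not library code) of the get_output_size() computation above: a KxK kernel at dilation d covers an effective window of (K - 1) * d + 1 elements, so with matching padding the output dimension is unchanged at stride 1. The 56-element dimension is an assumed example.

    static int iceildiv(const int a, const int b) { return (a + b - 1) / b; }

    static int dilated_output_size(const int dim_size, const int pad_before,
                                   const int pad_after, const int kernel_size,
                                   const int stride, const int dilation_factor)
    {
        const int input_size  = dim_size + pad_before + pad_after;        // 56 + 2 + 2 = 60
        const int window_size = (kernel_size - 1) * dilation_factor + 1;  // (3 - 1) * 2 + 1 = 5
        return iceildiv(input_size - window_size + 1, stride);            // ceil(56 / 1) = 56
    }

    // dilated_output_size(56, 2, 2, 3, 1, 2) == 56
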
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 785f6dc3b9..fbf8d17f67 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -523,7 +523,7 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvoluti
std::string func_name;
if(dwc_algorithm == DepthwiseConvolutionMethod::Optimized3x3)
{
- std::tie(func, func_name) = create_named_function<typename DepthwiseConvolutionLayerFunctions::DepthwiseConvolutionLayer3x3>(
+ std::tie(func, func_name) = create_named_function<typename DepthwiseConvolutionLayerFunctions::OptimizedDepthwiseConvolutionLayer>(
std::string("DepthwiseConvolutionLayer3x3"),
input, weights, biases, output, conv_info, depth_multiplier, fused_act);
}
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 396e2368c3..81bf53ace6 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -42,6 +42,7 @@
namespace arm_compute
{
+// Forward declarations
class ITensor;
/** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following NEON kernels:
@@ -157,6 +158,124 @@ private:
bool _is_prepared;
};
+/** Basic function to execute optimized depthwise convolution routines. This function calls the following NEON kernels:
+ *
+ * @note At the moment 3x3 and 5x5 convolutions with stride 1 or 2 are supported
+ *
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0 and no assembly kernel implementation is present)
+ * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
+ * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
+ * -# @ref NEActivationLayer if fused activation is required
+ *
+ */
+class NEDepthwiseConvolutionLayerOptimized : public IFunction
+{
+public:
+ /** Default constructor */
+ NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionLayerOptimized(const NEDepthwiseConvolutionLayerOptimized &) = delete;
+ /** Default move constructor */
+ NEDepthwiseConvolutionLayerOptimized(NEDepthwiseConvolutionLayerOptimized &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionLayerOptimized &operator=(const NEDepthwiseConvolutionLayerOptimized &) = delete;
+ /** Default move assignment operator */
+ NEDepthwiseConvolutionLayerOptimized &operator=(NEDepthwiseConvolutionLayerOptimized &&) = default;
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerOptimized
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[in] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+
+ // Inherited methods overriden:
+ void run() override;
+ void prepare() override;
+
+private:
+ /** Configure the kernels/functions for the generic pipeline.
+ *
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info Activation layer information in case of a fused activation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ *
+ */
+ void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
+ /** Configure the kernels/functions for the optimized pipeline.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+ * @param[in] act_info Activation layer information in case of a fused activation.
+ */
+ void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
+ /** Run generic kernel */
+ void run_generic();
+ /** Run optimized function */
+ void run_optimized();
+
+private:
+ MemoryGroup _memory_group;
+ NEDepthwiseConvolutionLayer3x3Kernel _dwc_kernel;
+ NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func;
+ NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
+ NEFillBorderKernel _border_handler;
+ NEPermute _permute_input;
+ NEPermute _permute_weights;
+ NEPermute _permute_output;
+ NEActivationLayer _activationlayer_function;
+ Tensor _accumulator;
+ Tensor _permuted_input;
+ Tensor _permuted_weights;
+ Tensor _permuted_output;
+ const ITensor *_original_weights;
+ bool _has_bias;
+ bool _is_quantized;
+ bool _is_optimized;
+ bool _is_nchw;
+ bool _permute;
+ bool _is_activationlayer_enabled;
+ bool _is_prepared;
+};
+
/** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
*
* -# @ref NEDepthwiseIm2ColKernel
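
For reference (not part of the patch): the usual validate-before-configure pattern for the NEDepthwiseConvolutionLayerOptimized function added above, now carrying the dilation argument. Only the validate()/configure() signatures come from this header; the tensors, shapes, and the ARM_COMPUTE_ERROR_THROW_ON check are illustrative assumptions.

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_checked(Tensor &input, Tensor &weights, Tensor &biases, Tensor &output,
                           NEDepthwiseConvolutionLayerOptimized &dwc)
    {
        const PadStrideInfo conv_info(1, 1, 2, 2); // stride 1, pad 2

        const Status status = NEDepthwiseConvolutionLayerOptimized::validate(
            input.info(), weights.info(), biases.info(), output.info(),
            conv_info, /* depth_multiplier */ 1, ActivationLayerInfo(), Size2D(2U, 2U));
        ARM_COMPUTE_ERROR_THROW_ON(status); // reject invalid configurations up front

        dwc.configure(&input, &weights, &biases, &output, conv_info,
                      1, ActivationLayerInfo(), Size2D(2U, 2U));
    }
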
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
index 7d2cff7315..b88e750fa9 100644
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -30,9 +30,6 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
namespace arm_compute
{
/** Depthwise convolution assembly kernel glue */
@@ -52,38 +49,44 @@ public:
NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
/** Default move assignment operator */
NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
+ /** Default destructor */
+ ~NEDepthwiseConvolutionAssemblyDispatch();
/** Initialize the function's source, destination, kernels and border_size.
*
* @note Supports only NHWC format
*
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
* @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1, 1));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
*
* @note Supports only NHWC format
*
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
* @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
* Data type supported: Same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*
* @return An error status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1, 1));
/** Check if the optimized kernel can be used for the given kernel sizes and strides
*
* @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
@@ -103,16 +106,18 @@ public:
void prepare() override;
private:
- MemoryGroup _memory_group;
- const ITensor *_input;
- const ITensor *_weights;
- const ITensor *_bias;
- ITensor *_output;
- Tensor _packed_weights;
- Tensor _workspace;
- bool _is_prepared;
- std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel;
- NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel;
+ struct LocalImpl;
+
+private:
+ MemoryGroup _memory_group;
+ const ITensor *_input;
+ const ITensor *_weights;
+ const ITensor *_bias;
+ ITensor *_output;
+ Tensor _packed_weights;
+ Tensor _workspace;
+ bool _is_prepared;
+ std::unique_ptr<LocalImpl> _pImpl;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__ */
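
The member changes above switch the dispatch class to the pImpl idiom, so the assembly kernel and depthwise headers no longer leak into the public include. A generic sketch of that idiom follows (illustrative names, not library code):

    // widget.h -- public header: only a forward declaration of the implementation.
    #include <memory>

    class Widget
    {
    public:
        Widget();
        ~Widget(); // defined in the .cpp, where Impl is a complete type
        void run();

    private:
        struct Impl;                  // hides the heavy internal types
        std::unique_ptr<Impl> _pImpl;
    };

    // widget.cpp -- the only place that includes the internal headers.
    struct Widget::Impl
    {
        /* e.g. std::unique_ptr<depthwise::IDepthwiseConvolution> kernel; */
    };

    Widget::Widget() : _pImpl(std::make_unique<Impl>()) {}
    Widget::~Widget() = default;
    void Widget::run() { /* work through *_pImpl */ }
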
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 8aa43201ad..f216519e58 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -241,6 +241,7 @@ v19.08 Public major release
- Various optimisations.
- Deprecated functions/interfaces
- Altered @ref QuantizationInfo interface to support per-channel quantization.
+ - The @ref NEDepthwiseConvolutionLayer3x3 will be replaced by @ref NEDepthwiseConvolutionLayerOptimized to accommodate future optimizations.
v19.05 Public major release
- Various bug fixes.
diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
index b50067b4e0..88b37b8a83 100644
--- a/src/core/NEON/kernels/convolution/common/padding.cpp
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -24,8 +24,8 @@
#include <cstring>
#include <cstdint>
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+#include "arm.hpp"
+#include "padding.hpp"
namespace padding
{
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.cpp b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
index 1de9ebf28a..64e3156bff 100644
--- a/src/core/NEON/kernels/convolution/common/qasymm8.cpp
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
@@ -28,7 +28,7 @@
#include <cmath>
#include <limits>
-#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+#include "qasymm8.hpp"
namespace qasymm8
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
new file mode 100644
index 0000000000..3e2bbbb61a
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "impl_dilated.hpp"
+
+template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
new file mode 100644
index 0000000000..879e06158d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_quantized_dilated.hpp"
+#include "impl_dilated.hpp"
+
+namespace depthwise {
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+ KernelCols, StrideRows, StrideCols>::
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right)
+ : QAsymm8DilatedDepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+ QAsymm8DilatedDepthwiseConvolution::get_output_size(
+ n_input_rows, padding_top, padding_bottom, dilation_factor),
+ QAsymm8DilatedDepthwiseConvolution::get_output_size(
+ n_input_cols, padding_left, padding_right, dilation_factor),
+ activation, weight_quantisation, input_quantisation,
+ output_quantisation, padding_top, padding_left, padding_bottom,
+ padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+ KernelCols, StrideRows, StrideCols>::
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right)
+ : QAsymm8DilatedDepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+ n_output_rows, n_output_cols, activation, weight_quantisation,
+ input_quantisation, output_quantisation,
+ qasymm8::QAsymm8RescaleParams::make_rescale_params(
+ weight_quantisation, input_quantisation, output_quantisation),
+ padding_top, padding_left, padding_bottom, padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+ KernelCols, StrideRows, StrideCols>::
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right)
+ : QAsymm8DilatedDepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+ QAsymm8DilatedDepthwiseConvolution::get_output_size(
+ n_input_rows, padding_top, padding_bottom, dilation_factor),
+ QAsymm8DilatedDepthwiseConvolution::get_output_size(
+ n_input_cols, padding_left, padding_right, dilation_factor),
+ activation, weight_quantisation, input_quantisation,
+ output_quantisation, rescale_parameters, padding_top, padding_left,
+ padding_bottom, padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+ KernelCols, StrideRows, StrideCols>::
+ QAsymm8DilatedDepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int dilation_factor, int n_output_rows, int n_output_cols,
+ nck::ActivationFunction activation,
+ const qasymm8::QAsymm8Params &weight_quantisation,
+ const qasymm8::QAsymm8Params &input_quantisation,
+ const qasymm8::QAsymm8Params &output_quantisation,
+ const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+ unsigned int padding_top, unsigned int padding_left,
+ unsigned int padding_bottom, unsigned int padding_right)
+ : DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+ KernelCols, StrideRows, StrideCols, uint8_t,
+ int32_t, uint8_t>(
+ n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+ n_output_rows, n_output_cols, activation, padding_top, padding_left,
+ padding_bottom, padding_right,
+ [weight_quantisation, input_quantisation, output_quantisation,
+ rescale_parameters](
+ const int n_batches, const int n_input_rows,
+ const int n_input_cols, const int n_channels,
+ const int n_output_rows, const int n_output_cols,
+ const nck::ActivationFunction activation,
+ const unsigned int padding_top, const unsigned int padding_left,
+ const unsigned int padding_bottom,
+ const unsigned int padding_right) -> IDepthwiseConvolution * {
+ return new QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+ StrideRows, StrideCols>(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ n_output_rows, n_output_cols, activation, weight_quantisation,
+ input_quantisation, output_quantisation, rescale_parameters,
+ padding_top, padding_left, padding_bottom, padding_right);
+ }) {}
+
+} // namespace depthwise
+
+template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
+template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
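
The two delegating constructors above show how shapes are handled in the dilated quantised case: the first derives the output extent per axis through get_output_size() from the input extent, the padding and the dilation factor, while the second forwards a factory functor that the dilated base class can use to build the inner, non-dilated QAsymm8DepthwiseConvolution instances. The helper below is only an illustration of what such an output-size computation typically looks like (effective kernel extent dilation*(K-1)+1); the library's own get_output_size() is defined elsewhere in this directory and may round differently.

// Illustrative sketch, not the library implementation: output extent of a
// dilated convolution along one axis, using the textbook formula.
int dilated_output_size(int input_size, int pad_before, int pad_after,
                        int kernel_size, int stride, int dilation_factor)
{
  const int effective_kernel = dilation_factor * (kernel_size - 1) + 1;
  return (input_size + pad_before + pad_after - effective_kernel) / stride + 1;
}
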
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
index 692086c74a..f86f1bad73 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "impl_base.hpp"
// TODO Move to common utilities somewhere
template <size_t Size> struct DType { };
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index cbdb19a067..87d2bfd8e6 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -30,8 +30,8 @@
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
#pragma once
@@ -68,6 +68,31 @@ template <
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
+DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+>::DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ n_output_rows, n_output_cols, activation,
+ padding_top, padding_left, padding_bottom, padding_right
+ )
+{
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
template <ActivationFunction Activation>
void DepthwiseConvolution<
OutputTileRows, OutputTileCols,
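
The constructor added here (and its fp32 twin in the next file) simply forwards explicit n_output_rows/n_output_cols to the base class instead of letting it re-derive them. That is presumably what the dilated wrapper needs, as suggested by the factory functor in depthwise_dilated_qa8_qa8.cpp above: with dilation d the input decomposes into d interleaved sub-grids per axis, those sub-grids generally have different sizes, so each inner convolution must be handed its own output extent rather than recomputing it from one shared shape. A small sketch of that decomposition (illustration only, not library code):

#include <vector>

// Sizes of the d interleaved sub-grids along one axis; sub-grid i holds
// elements i, i + d, i + 2d, ... of the original axis.
std::vector<int> subgrid_sizes(int input_size, int dilation_factor)
{
  std::vector<int> sizes(dilation_factor);
  for (int i = 0; i < dilation_factor; i++)
  {
    sizes[i] = (input_size - i + dilation_factor - 1) / dilation_factor;
  }
  return sizes;  // e.g. input_size = 11, dilation_factor = 3  ->  {4, 4, 3}
}
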
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 264576137c..e19e4c668c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -30,8 +30,8 @@
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
#pragma once
@@ -63,6 +63,30 @@ DepthwiseConvolution<
{
}
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float, float, float
+>::DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ n_output_rows, n_output_cols, activation,
+ padding_top, padding_left, padding_bottom, padding_right
+ )
+{
+}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
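
For reference, a construction-only sketch of the new float overload. The include path is an assumption made for illustration, and no run() call is shown; the explicit 8x8 output matches a 10x10 unpadded input with a 3x3 stride-1 kernel (10 - 3 + 1 = 8).

#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"

void construct_example()
{
  using Conv = depthwise::DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
  // 1 batch, 10x10 input, 16 channels, explicit 8x8 output, no activation,
  // zero padding on all four sides.
  Conv conv(1, 10, 10, 16, 8, 8,
            neon_convolution_kernels::ActivationFunction::None,
            0, 0, 0, 0);
  (void)conv;
}
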
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
index 5546d37e59..bda875dfe1 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -32,16 +32,39 @@
#include <limits>
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
+#include "depthwise_quantized.hpp"
#pragma once
+// Comment the following to use floating-point based quantisation, leave
+// uncommented to use fixed-point.
+#define FIXED_POINT_REQUANTISATION 1
+
using namespace neon_convolution_kernels;
using namespace qasymm8;
template <typename T>
+struct clamp_to_limits
+{
+ template <typename U>
+ static inline U clamp(const U& v)
+ {
+ const std::numeric_limits<T> limits;
+ const U min = static_cast<U>(limits.min());
+ const U max = static_cast<U>(limits.max());
+ return std::min(std::max(v, min), max);
+ }
+
+ template <typename U>
+ static inline T clamp_and_cast(const U& v)
+ {
+ return static_cast<U>(clamp(v));
+ }
+};
+
+template <typename T>
inline T saturating_doubling_high_mul(const T&, const int32_t&);
template <>
@@ -87,103 +110,214 @@ inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
namespace depthwise
{
template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- ) : QAsymm8DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right
-)
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ const ActivationFunction activation,
+ const QAsymm8Params& weight_quantisation,
+ const QAsymm8Params& input_quantisation,
+ const QAsymm8Params& output_quantisation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : QAsymm8DepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ activation, weight_quantisation, input_quantisation, output_quantisation,
+ QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
+ padding_top, padding_left, padding_bottom, padding_right
+ )
{
}
template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QAsymm8Params& weight_quantisation,
- const QAsymm8Params& input_quantisation,
- const QAsymm8Params& output_quantisation,
- const QAsymm8RescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- ) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
-),
- _weights_quant(weight_quantisation),
- _inputs_quant(input_quantisation),
- _output_quant(output_quantisation),
- rescale_parameters(rescale_params)
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ const ActivationFunction activation,
+ const QAsymm8Params& weight_quantisation,
+ const QAsymm8Params& input_quantisation,
+ const QAsymm8Params& output_quantisation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : QAsymm8DepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ n_output_rows, n_output_cols,
+ activation, weight_quantisation, input_quantisation, output_quantisation,
+ QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
+ padding_top, padding_left, padding_bottom, padding_right
+ )
{
}
template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ const ActivationFunction activation,
+ const QAsymm8Params& weight_quantisation,
+ const QAsymm8Params& input_quantisation,
+ const QAsymm8Params& output_quantisation,
+ const QAsymm8RescaleParams& rescale_params,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ get_activation_fn(activation, output_quantisation),
+ padding_top, padding_left, padding_bottom, padding_right
+ ),
+ _weights_quant(weight_quantisation),
+ _inputs_quant(input_quantisation),
+ _output_quant(output_quantisation),
+ rescale_parameters(rescale_params)
+{
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ int n_output_rows, int n_output_cols,
+ const ActivationFunction activation,
+ const QAsymm8Params& weight_quantisation,
+ const QAsymm8Params& input_quantisation,
+ const QAsymm8Params& output_quantisation,
+ const QAsymm8RescaleParams& rescale_params,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ n_output_rows, n_output_cols,
+ get_activation_fn(activation, output_quantisation),
+ padding_top, padding_left, padding_bottom, padding_right
+ ),
+ _weights_quant(weight_quantisation),
+ _inputs_quant(input_quantisation),
+ _output_quant(output_quantisation),
+ rescale_parameters(rescale_params)
+{
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+ActivationFunction QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::get_activation_fn(
+ const ActivationFunction activation,
+ const QAsymm8Params& output_quant
+)
+{
+ if (
+ (activation == ActivationFunction::ReLU &&
+ output_quant.quantize(0) == 0) ||
+ (activation == ActivationFunction::ReLU6 &&
+ output_quant.quantize(0) == 0 &&
+ output_quant.dequantize(255) <= 6.0f)
+ )
+ {
+      // If the range of values which can be represented by a quantized value is
+      // within the range that would be produced by the activation function, then
+      // the activation function is redundant and can be skipped.
+ return ActivationFunction::None;
+ }
+ else if(
+ activation == ActivationFunction::ReLU6 &&
+ output_quant.dequantize(255) <= 6.0f
+ )
+ {
+ // If the largest value that can be represented by a quantized value is
+ // lower than the upper boundary, then the activation function can be
+ // relaxed to a ReLU.
+ return ActivationFunction::ReLU;
+ }
+
+ return activation;
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
return _inputs_quant.offset;
}
template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
- ) const
+ void * const buffer,
+ const void * const weights,
+ const unsigned int weight_row_stride,
+ const unsigned int weight_col_stride,
+ const void * const biases
+) const
{
const uint8_t *wptr = static_cast<const uint8_t *>(weights);
const int32_t *bptr = static_cast<const int32_t *>(biases);
uint8_t *outptr = static_cast<uint8_t *>(buffer);
- // We set the vector length to use doubles on both Aarch64 and Aarch32. NOTE
- // For SVE set this to half the vector length.
+ // We set the vector length to use quad registers on Aarch64 and only doubles
+ // on Aarch32. NOTE For SVE set this to the actual vector length.
+#if defined(__aarch64__)
+ unsigned int veclen = 16;
+#else
+#if defined(__arm__)
unsigned int veclen = 8;
+#endif
+#endif
+
+ // Compute the rank 0 offset arising from the quantisation parameters.
+ const int32_t rank0_offset = (KernelRows * KernelCols *
+ static_cast<int32_t>(_weights_quant.offset) *
+ static_cast<int32_t>(_inputs_quant.offset));
// While there are channels left to process, pack a vector length of them at
// a time and reduce the size of vector used as the size of the tensor
// decreases.
for (
- unsigned int n_channels = this->n_channels(); n_channels;
- n_channels -= veclen,
- outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
- )
+ unsigned int n_channels = this->n_channels(); n_channels;
+ n_channels -= veclen,
+ outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
+ )
{
// NOTE Ignore this section if using SVE, the vector length remains the
// same and we just don't fill a full register for the tail.
@@ -201,8 +335,8 @@ void QAsymm8DepthwiseConvolution<
// Copy a vector length of elements
for (unsigned int n = 0; n < veclen && n < n_channels; n++)
{
- const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
- out_bptr[n] = bias;
+ int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
+ uint32_t weight_sum = 0;
for (unsigned int i = 0; i < KernelRows; i++)
{
@@ -211,140 +345,297 @@ void QAsymm8DepthwiseConvolution<
{
uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
row_outptr[j*veclen + n] = w;
+ weight_sum += static_cast<uint32_t>(w);
}
}
wptr++;
+
+ // Include in the bias contributions from the quantisation offset
+ int32_t rank1_offset = static_cast<int32_t>(
+ static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
+ );
+ out_bptr[n] = bias + rank0_offset - rank1_offset;
}
}
}
template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
-static inline void tilefn(
- int n_channels,
- const void* packed_params,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- const int32_t clamp_max,
- const int32_t clamp_min,
- const uint8_t input_offset,
- const uint8_t weight_offset,
- const uint8_t output_offset,
- const int32_t requant_multiplier,
- const int32_t requant_shift
- )
+template<ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptr,
+ const unsigned int in_row_stride,
+ const unsigned int in_col_stride,
+ uint8_t* outptr,
+ const unsigned int out_row_stride,
+ const unsigned int out_col_stride
+)
{
- constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
- constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
- // Offset into channels
- int channel = 0;
+ // Activation parameters (unused if Activation is None)
+ const uint8_t aqmin = _output_quant.offset;
+ const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
+ std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
// Byte type pointer to weights and biases
const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
- for (; n_channels >= 8; n_channels -= 8, channel += 8)
+#if defined(__aarch64__) // Under Aarch64 only use quad registers
+ for (; n_channels >= 16; n_channels -= 16)
+ {
+ // Load biases
+ const int32x4_t biases[4] = {
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
+ };
+ wbptr += 16*sizeof(int32_t);
+
+ // Load weights
+ uint8x16_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = vld1q_u8(wbptr);
+ wbptr += 16;
+ }
+ }
+
+ // Load the input activations
+ uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
+ }
+ }
+ inptr += 16;
+
+ // Perform the convolution
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+        // Two sets of operations are required: we perform the
+ // multiply-accumulates for the convolution proper but must also sum
+ // the tile elements to account for the _weight_ offset.
+ uint32x4_t accs[4];
+ for (unsigned int i = 0; i < 4; i++)
+ {
+ accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+ }
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ // Get relevant weight and activation pixel
+ const uint8x16_t w = weights[wi][wj];
+ const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+ // Perform multiplication and accumulation
+ const uint16x8_t muls[2] = {
+ vmull_u8(vget_low_u8(w), vget_low_u8(x)),
+ vmull_u8(vget_high_u8(w), vget_high_u8(x))
+ };
+
+ const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+ const uint16x8_t sum_elems[2] = {
+ vmull_u8(vget_low_u8(x), woffset),
+ vmull_u8(vget_high_u8(x), woffset)
+ };
+
+ const uint32x4_t tmps[4] = {
+ vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
+ vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
+ vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
+ vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
+ };
+ for (unsigned int i = 0; i < 4; i++)
+ {
+ accs[i] = vaddq_u32(accs[i], tmps[i]);
+ }
+ }
+ }
+
+ // Rescale the accumulator and add in the new offset.
+ uint32x4_t final_accs[4];
+ for (unsigned int i = 0; i < 4; i++)
+ {
+#ifdef FIXED_POINT_REQUANTISATION
+ const int32x4_t y = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(
+ reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+ ),
+ rescale_parameters.shift
+ );
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+ final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else // floating point requantisation
+ float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+ fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+ fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+ fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+ final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+ }
+
+ uint8x16_t output = vcombine_u8(
+ vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
+ vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
+ );
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = vmaxq_u8(output, vdupq_n_u8(aqmin));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = vminq_u8(output, vdupq_n_u8(aqmax));
+ }
+
+ vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
+ }
+ }
+ outptr += 16;
+ }
+#endif // defined(__aarch64__)
+ for (; n_channels >= 8; n_channels -= 8)
{
const int32x4_t biases[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
};
wbptr += 8*sizeof(int32_t);
- int16x8_t weights[KernelRows][KernelCols];
- const uint8x8_t woffset = vdup_n_u8(weight_offset);
+ uint8x8_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
- const uint8x8_t w = vld1_u8(wbptr);
- weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
+ weights[i][j] = vld1_u8(wbptr);
wbptr += 8;
}
}
- int16x8_t inputs[InnerTileRows][InnerTileCols];
- const uint8x8_t ioffset = vdup_n_u8(input_offset);
- for (unsigned int i = 0; i < InnerTileRows; i++)
+ uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
- for (unsigned int j = 0; j < InnerTileCols; j++)
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
- const auto x = vld1_u8(get_input_ptr(i, j, channel));
- inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
+ inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
}
}
+ inptr += 8;
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
- int32x4_t accs[2];
+ uint32x4_t accs[2];
for (unsigned int i = 0; i < 2; i++)
{
- accs[i] = biases[i];
+ accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
}
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
- const auto w = weights[wi][wj];
- const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
- accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
- accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
+ const uint8x8_t w = weights[wi][wj];
+ const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+ const uint16x8_t muls = vmull_u8(w, x);
+ const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+ const uint16x8_t sum_elems = vmull_u8(x, woffset);
+
+ const uint32x4_t tmps[2] = {
+ vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
+ vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
+ };
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ accs[i] = vaddq_u32(accs[i], tmps[i]);
+ }
}
}
- int32x4_t final_accs[2];
+ uint32x4_t final_accs[2];
for (unsigned int i = 0; i < 2; i++)
{
+#ifdef FIXED_POINT_REQUANTISATION
const int32x4_t y = rounding_divide_by_exp2(
- saturating_doubling_high_mul(accs[i], requant_multiplier),
- requant_shift);
- const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
- final_accs[i] = vaddq_s32(y, offset);
- final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
- final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
+ saturating_doubling_high_mul(
+ reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+ ),
+ rescale_parameters.shift
+ );
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+ final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else // floating point requantisation
+ float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+ fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+ fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+ fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+ final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
}
- const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1]));
- const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
- vst1_u8(get_output_ptr(oi, oj, channel), output);
+ uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = vmax_u8(output, vdup_n_u8(aqmin));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = vmin_u8(output, vdup_n_u8(aqmax));
+ }
+
+ vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
}
}
+ outptr += 8;
}
- for (; n_channels; n_channels--, channel++)
+ for (; n_channels; n_channels--)
{
// Load bias
const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
wbptr += sizeof(int32_t);
// Load weights
- int16_t weights[KernelRows][KernelCols];
+ uint8_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
- weights[i][j] = *(wbptr++) - weight_offset;
+ weights[i][j] = *(wbptr++);
}
}
// Load the input activations
- int16_t inputs[InnerTileRows][InnerTileCols];
- for (unsigned int i = 0; i < InnerTileRows; i++)
+ uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
- for (unsigned int j = 0; j < InnerTileCols; j++)
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
- inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
+ inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
}
}
+ inptr++;
// Perform the convolution
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
@@ -352,135 +643,377 @@ static inline void tilefn(
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
int32_t acc = bias;
+ uint32_t element_sum = 0;
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
- acc += w * x;
+ acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
+ element_sum += static_cast<uint32_t>(x);
}
}
+ acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);
+
// Requantize
+#ifdef FIXED_POINT_REQUANTISATION
acc = rounding_divide_by_exp2(
- saturating_doubling_high_mul(acc, requant_multiplier),
- requant_shift);
- acc += output_offset;
- acc = std::max(acc, clamp_min);
- acc = std::min(acc, clamp_max);
- uint8_t output = static_cast<uint8_t>(acc);
- *(get_output_ptr(oi, oj, channel)) = output;
+ saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
+ rescale_parameters.shift
+ );
+ acc += _output_quant.offset;
+ uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
+#else // floating point requantization
+ float fp_acc = static_cast<float>(acc);
+ fp_acc *= rescale_parameters.rescale;
+ fp_acc += static_cast<float>(_output_quant.offset);
+ fp_acc = std::max<float>(fp_acc, 0.0f);
+ uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
+#endif
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = std::max(output, aqmin);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = std::min(output, aqmax);
+ }
+
+ *(outptr + oi*out_row_stride + oj*out_col_stride) = output;
}
}
+ outptr++;
}
}
template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
-static inline void execute_tilefn(
- int n_channels,
- const void* packed_params,
- const nck::ActivationFunction actfn,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- const QAsymm8Params &input_quant,
- const QAsymm8Params &weight_quant,
- const QAsymm8Params &output_quant,
- const QAsymm8RescaleParams &requant
- ) {
- // Compute min/max clamp values
- int32_t clamp_min = std::numeric_limits<uint8_t>::min();
- int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
- if (actfn == nck::ActivationFunction::ReLU ||
- actfn == nck::ActivationFunction::ReLU6) {
- const int32_t bottom_rail = output_quant.offset;
- clamp_min = std::max(clamp_min, bottom_rail);
+template<ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ // Activation parameters (unused if Activation is None)
+ const uint8_t aqmin = _output_quant.offset;
+ const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
+ std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
+
+ // Byte type pointer to weights and biases
+ const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
+
+ // Offset into input/output tensors
+ int n = 0;
+
+#if defined(__aarch64__) // Under Aarch64 only use quad registers
+ for (; n_channels >= 16; n_channels -= 16, n += 16)
+ {
+ // Load biases
+ const int32x4_t biases[4] = {
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
+ };
+ wbptr += 16*sizeof(int32_t);
+
+ // Load weights
+ uint8x16_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = vld1q_u8(wbptr);
+ wbptr += 16;
+ }
+ }
+
+ // Load the input activations
+ uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
+ }
+ }
+
+ // Perform the convolution
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+        // Two sets of operations are required: we perform the
+ // multiply-accumulates for the convolution proper but must also sum
+ // the tile elements to account for the _weight_ offset.
+ uint32x4_t accs[4];
+ for (unsigned int i = 0; i < 4; i++)
+ {
+ accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+ }
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ // Get relevant weight and activation pixel
+ const uint8x16_t w = weights[wi][wj];
+ const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+ // Perform multiplication and accumulation
+ const uint16x8_t muls[2] = {
+ vmull_u8(vget_low_u8(w), vget_low_u8(x)),
+ vmull_u8(vget_high_u8(w), vget_high_u8(x))
+ };
+
+ const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+ const uint16x8_t sum_elems[2] = {
+ vmull_u8(vget_low_u8(x), woffset),
+ vmull_u8(vget_high_u8(x), woffset)
+ };
+
+ const uint32x4_t tmps[4] = {
+ vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
+ vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
+ vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
+ vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
+ };
+ for (unsigned int i = 0; i < 4; i++)
+ {
+ accs[i] = vaddq_u32(accs[i], tmps[i]);
+ }
+ }
+ }
+
+ // Rescale the accumulator and add in the new offset.
+ uint32x4_t final_accs[4];
+ for (unsigned int i = 0; i < 4; i++)
+ {
+#ifdef FIXED_POINT_REQUANTISATION
+ const int32x4_t y = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(
+ reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+ ),
+ rescale_parameters.shift
+ );
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+ final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else // floating point requantisation
+ float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+ fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+ fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+ fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+ final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+ }
+
+ uint8x16_t output = vcombine_u8(
+ vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
+ vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
+ );
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = vmaxq_u8(output, vdupq_n_u8(aqmin));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = vminq_u8(output, vdupq_n_u8(aqmax));
+ }
+
+ vst1q_u8(outptrs[oi][oj] + n, output);
+ }
+ }
}
+#endif // defined(__aarch64__)
+ for (; n_channels >= 8; n_channels -= 8, n += 8)
+ {
+ const int32x4_t biases[2] = {
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ };
+ wbptr += 8*sizeof(int32_t);
+
+ uint8x8_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = vld1_u8(wbptr);
+ wbptr += 8;
+ }
+ }
+
+ uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = vld1_u8(inptrs[i][j] + n);
+ }
+ }
+
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+ uint32x4_t accs[2];
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
+ }
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ const uint8x8_t w = weights[wi][wj];
+ const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+
+ const uint16x8_t muls = vmull_u8(w, x);
+ const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
+ const uint16x8_t sum_elems = vmull_u8(x, woffset);
+
+ const uint32x4_t tmps[2] = {
+ vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
+ vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
+ };
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ accs[i] = vaddq_u32(accs[i], tmps[i]);
+ }
+ }
+ }
+
+ uint32x4_t final_accs[2];
+ for (unsigned int i = 0; i < 2; i++)
+ {
+#ifdef FIXED_POINT_REQUANTISATION
+ const int32x4_t y = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(
+ reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
+ ),
+ rescale_parameters.shift
+ );
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
+ final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
+#else // floating point requantisation
+ float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
+ fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
+ fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
+ fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
+ final_accs[i] = vcvtq_u32_f32(fp_acc);
+#endif
+ }
+
+ uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = vmax_u8(output, vdup_n_u8(aqmin));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = vmin_u8(output, vdup_n_u8(aqmax));
+ }
- if (actfn == nck::ActivationFunction::ReLU6) {
- const int32_t top_rail = output_quant.quantize(6.0f);
- clamp_max = std::min(clamp_max, top_rail);
+ vst1_u8(outptrs[oi][oj] + n, output);
+ }
+ }
}
+ for (; n_channels; n_channels--, n++)
+ {
+ // Load bias
+ const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
+ wbptr += sizeof(int32_t);
- // Call the tile execution method
- tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
- clamp_max, clamp_min, input_quant.offset,
- weight_quant.offset, output_quant.offset,
- requant.multiplier, requant.shift);
-}
+ // Load weights
+ uint8_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = *(wbptr++);
+ }
+ }
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- ) {
- // Construct methods to get pointers
- const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
- const int i, const int j, const int channel) {
- return inptr + i * in_row_stride + j * in_col_stride + channel;
- };
-
- const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
- const int i, const int j, const int channel) {
- return outptr + i * out_row_stride + j * out_col_stride + channel;
- };
-
- execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
- _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
+ // Load the input activations
+ uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ inputs[i][j] = *(inptrs[i][j] + n);
+ }
+ }
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- ) {
- // Construct methods to get pointers
- const auto get_input_ptr = [inptrs](const int i, const int j,
- const int channel) {
- return inptrs[i][j] + channel;
- };
-
- const auto get_output_ptr = [outptrs](const int i, const int j,
- const int channel) {
- return outptrs[i][j] + channel;
- };
-
- // Call the tile execution method
- execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
- _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
+ // Perform the convolution
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+ int32_t acc = bias;
+ uint32_t element_sum = 0;
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+ acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
+ element_sum += static_cast<uint32_t>(x);
+ }
+ }
+
+ acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);
+
+ // Requantize
+#ifdef FIXED_POINT_REQUANTISATION
+ acc = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
+ rescale_parameters.shift
+ );
+ acc += _output_quant.offset;
+ uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
+#else // floating point requantization
+ float fp_acc = static_cast<float>(acc);
+ fp_acc *= rescale_parameters.rescale;
+ fp_acc += static_cast<float>(_output_quant.offset);
+ fp_acc = std::max<float>(fp_acc, 0.0f);
+ uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
+#endif
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ output = std::max(output, aqmin);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ output = std::min(output, aqmax);
+ }
+
+ *(outptrs[oi][oj] + n) = output;
+ }
+ }
+ }
}
} // namespace depthwise
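
The heart of this rewrite is an algebraic rearrangement of the quantised multiply-accumulate. Writing o_w and o_i for the weight and input offsets over an N-element kernel window,

  sum_k (w_k - o_w) * (x_k - o_i)
    = sum_k w_k * x_k  -  o_w * sum_k x_k  -  o_i * sum_k w_k  +  N * o_w * o_i

so the two input-independent terms (o_i * sum(w) and N * o_w * o_i) can be folded into the packed bias by _pack_params, leaving execute_tile to compute only sum(w*x) minus o_w times the running input sum; this is why the NEON paths above multiply raw uint8 values with vmull_u8 instead of widening and subtracting offsets element by element. The scalar reference below retraces that split plus the fixed-point requantisation. It is a simplified illustration, not the library code, and the two rounding helpers are stand-ins that mirror the int32_t specialisations declared near the top of this header.

#include <algorithm>
#include <cstdint>
#include <vector>

// Stand-in for saturating_doubling_high_mul<int32_t> (gemmlowp-style rounding).
static int32_t sdhm_ref(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == INT32_MIN);
  const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
  return overflow ? INT32_MAX : static_cast<int32_t>((ab + nudge) / (INT64_C(1) << 31));
}

// Stand-in for rounding_divide_by_exp2(int32_t, int).
static int32_t rdbe2_ref(int32_t x, int exponent)
{
  const int32_t mask      = (1 << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// One output pixel, computed the same way the kernel above splits the work
// between _pack_params (bias folding) and execute_tile (activation omitted).
uint8_t dwc_output_pixel_ref(const std::vector<uint8_t> &w,  // kernel window weights
                             const std::vector<uint8_t> &x,  // matching input patch
                             int32_t bias,
                             uint8_t w_offset, uint8_t i_offset, uint8_t o_offset,
                             int32_t multiplier, int32_t shift)
{
  const int32_t n = static_cast<int32_t>(w.size());

  // Packing stage: bias + N*o_w*o_i (rank-0 term) - o_i*sum(w) (rank-1 term).
  int32_t w_sum = 0;
  for (uint8_t v : w) { w_sum += v; }
  int32_t acc = bias + n * w_offset * i_offset - i_offset * w_sum;

  // Tile stage: sum(w*x) - o_w*sum(x).
  int32_t x_sum = 0;
  for (int32_t k = 0; k < n; k++)
  {
    acc   += static_cast<int32_t>(w[k]) * static_cast<int32_t>(x[k]);
    x_sum += x[k];
  }
  acc -= w_offset * x_sum;

  // Fixed-point requantisation, output offset, clamp to the uint8 range.
  acc = rdbe2_ref(sdhm_ref(acc, multiplier), shift);
  acc += o_offset;
  return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(acc, 0), 255));
}
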
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 9f8064e924..c14100ab42 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -59,8 +59,8 @@ struct CLConvolutionLayerFunctions
/** Collection of CL depthwise convolution functions */
struct CLDepthwiseConvolutionLayerFunctions
{
- using GenericDepthwiseConvolutionLayer = CLDepthwiseConvolutionLayer;
- using DepthwiseConvolutionLayer3x3 = CLDepthwiseConvolutionLayer3x3;
+ using GenericDepthwiseConvolutionLayer = CLDepthwiseConvolutionLayer;
+ using OptimizedDepthwiseConvolutionLayer = CLDepthwiseConvolutionLayer3x3;
};
/** Collection of CL element-wise functions */
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index c31072661b..d4892f53a6 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -65,8 +65,8 @@ struct NEConvolutionLayerFunctions
/** Collection of CL depthwise convolution functions */
struct NEDepthwiseConvolutionLayerFunctions
{
- using GenericDepthwiseConvolutionLayer = NEDepthwiseConvolutionLayer;
- using DepthwiseConvolutionLayer3x3 = NEDepthwiseConvolutionLayer3x3;
+ using GenericDepthwiseConvolutionLayer = NEDepthwiseConvolutionLayer;
+ using OptimizedDepthwiseConvolutionLayer = NEDepthwiseConvolutionLayerOptimized;
};
/** Collection of CL element-wise functions */
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 43288ec4c6..45cc2d2762 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -363,6 +363,333 @@ void NEDepthwiseConvolutionLayer3x3::prepare()
}
}
+NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+ _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+ _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure_generic(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+
+ PixelValue zero_value(0.f);
+
+ // Initialize the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
+ {
+ TensorShape accum_shape = output->info()->tensor_shape();
+ DataLayout accum_layout = output->info()->data_layout();
+ if(!_is_nchw)
+ {
+ permute(accum_shape, PermutationVector(1U, 2U, 0U));
+ accum_layout = DataLayout::NCHW;
+ }
+
+ _memory_group.manage(&_accumulator);
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
+ _accumulator.info()->set_data_layout(accum_layout);
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
+ }
+
+ if(!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+ _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+ // Configure depthwise
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
+
+ // Configure border handler
+ _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ }
+ else
+ {
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
+
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ }
+
+ // Configure biases accumulation
+ if(_is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
+
+ float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
+ _accumulator.allocator()->allocate();
+ }
+ else if(_has_bias)
+ {
+ _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
+ }
+
+ // Permute output
+ if(!_is_nchw)
+ {
+ // Configure the function to transform the convoluted output to NHWC
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+ _permuted_output.allocator()->allocate();
+ }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure_optimized(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
+ if(!_is_activationlayer_enabled)
+ {
+ act_info_to_use = act_info;
+ }
+
+ if(_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+ // Configure optimized depthwise
+ _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
+ }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _has_bias = biases != nullptr;
+ _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+ weights->info(),
+ conv_info,
+ depth_multiplier,
+ dilation);
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _permute = _is_optimized == _is_nchw;
+ _is_prepared = false;
+ _is_activationlayer_enabled = act_info.enabled();
+
+ // Configure appropriate pipeline
+ if(_is_optimized)
+ {
+ configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ }
+ else
+ {
+ configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ }
+
+ // Configure activation
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
+}
+
+Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+ if(biases != nullptr)
+ {
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+ }
+
+ if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
+ {
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier, dilation));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+ }
+
+ //Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+
+ return Status{};
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run_generic()
+{
+ // Fill border
+ NEScheduler::get().schedule(&_border_handler, Window::DimX);
+
+ // Execute depthwise convolution
+ NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+
+ // Add biases
+ if(_has_bias || _is_quantized)
+ {
+ NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
+ }
+
+ // Permute output
+ if(!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run_optimized()
+{
+ // Run assembly function
+ _dwc_optimized_func.run();
+
+ // Permute output
+ if(_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if(_permute)
+ {
+ _permute_input.run();
+ }
+
+ _is_optimized ? run_optimized() : run_generic();
+
+ // Run activation
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+}
+
+void NEDepthwiseConvolutionLayerOptimized::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute weights
+ if(_permute)
+ {
+ _permuted_weights.allocator()->allocate();
+ _permute_weights.run();
+ _original_weights->mark_as_unused();
+ }
+
+ // Prepare optimized function
+ if(_is_optimized)
+ {
+ _dwc_optimized_func.prepare();
+ if(!_permuted_weights.is_used())
+ {
+ _permuted_weights.allocator()->free();
+ }
+ }
+
+ _is_prepared = true;
+ }
+}
+
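prepare() above follows the library's prepare-once pattern: the first call permutes and packs the weights, marks the originals as unused, and frees the permuted copy once the assembly function has consumed it; later calls are no-ops. A minimal standalone sketch of the idiom (simplified types, not the library API):

#include <functional>
#include <utility>

// One-shot preparation guard: expensive, data-independent work (weight
// permutation and packing in the function above) runs once, after which
// run() only performs the per-call work.
class PreparedFunction
{
public:
    PreparedFunction(std::function<void()> prepare_work, std::function<void()> run_work)
        : _prepare_work(std::move(prepare_work)), _run_work(std::move(run_work))
    {
    }

    void run()
    {
        prepare();
        _run_work();
    }

    void prepare()
    {
        if(!_is_prepared)
        {
            _prepare_work();
            _is_prepared = true;
        }
    }

private:
    std::function<void()> _prepare_work;
    std::function<void()> _run_work;
    bool                  _is_prepared{ false };
};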
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
_permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 5f57bbfe23..b28aaa715f 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -26,7 +26,9 @@
#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -42,19 +44,22 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
const ITensor *weights,
ITensor *output,
PadStrideInfo conv_info,
- ActivationLayerInfo act_info)
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
+ ARM_COMPUTE_UNUSED(dilation);
const DataType data_type = input->info()->data_type();
const TensorShape shape = input->info()->tensor_shape();
- const int n_batches = shape[3];
- const int in_rows = shape.z();
- const int in_cols = shape.y();
- const int n_channels = shape.x();
- const int padding_top = conv_info.pad_top();
- const int padding_left = conv_info.pad_left();
- const int padding_bottom = conv_info.pad_bottom();
- const int padding_right = conv_info.pad_right();
+ const int n_batches = shape[3];
+ const int in_rows = shape.z();
+ const int in_cols = shape.y();
+ const int n_channels = shape.x();
+ const int dilation_factor = dilation.x();
+ const int padding_top = conv_info.pad_top();
+ const int padding_left = conv_info.pad_left();
+ const int padding_bottom = conv_info.pad_bottom();
+ const int padding_right = conv_info.pad_right();
const unsigned int stride_x = conv_info.stride().first;
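create_convolver() forwards a single dilation_factor taken from dilation.x(); square dilation is what makes that safe, and the dispatcher's validate() further down rejects dilation.x() != dilation.y(). A standalone sketch of that extraction (the helper and the std::pair stand-in for Size2D are illustrative):

#include <cassert>
#include <utility>

// Only one dilation factor reaches the assembly convolver, so equal x/y
// dilation is assumed; validate() enforces it before this point is reached.
int single_dilation_factor(std::pair<unsigned int, unsigned int> dilation)
{
    assert(dilation.first == dilation.second && "only square dilation is supported");
    return static_cast<int>(dilation.first);
}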
@@ -95,11 +100,11 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
switch(stride_x)
{
case 1:
- return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
- n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+ return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+ n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
case 2:
- return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
- n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+ return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+ n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
default:
return nullptr;
}
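For the QASYMM8 path the dilated convolvers still take the weight/input/output quantization infos plus rescale_params. A hedged reading of what those rescale parameters represent, using the conventional asymmetric-quantization requantization factor (kept in floating point here; the library encodes it as a fixed-point multiplier and shift):

// Conventional requantization factor for a quantized convolution:
// real_multiplier = (input_scale * weight_scale) / output_scale.
float requantization_multiplier(float input_scale, float weight_scale, float output_scale)
{
    return (input_scale * weight_scale) / output_scale;
}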
@@ -115,11 +120,11 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
switch(stride_x)
{
case 1:
- return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
case 2:
- return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
default:
return nullptr;
}
@@ -131,11 +136,11 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
switch(stride_x)
{
case 1:
- return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
- n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
case 2:
- return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
- n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
default:
return nullptr;
}
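Reading the leading template arguments of the instantiations above as output-tile size, kernel size and stride (a hedged reading inferred from how they pair with the stride switch; depthwise.hpp is the authoritative reference), the input consumed per tile follows the usual convolution arithmetic:

// Input rows consumed by an output tile of T rows, kernel of K rows and
// stride S (undilated case; dilation scales K as in the earlier sketch).
constexpr unsigned int input_rows_per_tile(unsigned int T, unsigned int K, unsigned int S)
{
    return (T - 1) * S + K;
}

static_assert(input_rows_per_tile(4, 3, 1) == 6, "fp32 stride-1 tile reads 6 input rows");
static_assert(input_rows_per_tile(3, 3, 2) == 7, "fp32 stride-2 tile reads 7 input rows");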
@@ -148,21 +153,30 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor
}
} // namespace
+struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl
+{
+ std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel{ nullptr };
+ NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel{};
+};
+
#ifndef DOXYGEN_SKIP_THIS
NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
- _dwc_acl_kernel()
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false),
+ _pImpl(support::cpp14::make_unique<LocalImpl>())
{
}
#endif /* DOXYGEN_SKIP_THIS */
+NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default;
+
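Hiding the assembly kernel behind LocalImpl is the pImpl idiom; the out-of-line defaulted destructor added above is what allows std::unique_ptr to destroy a type that is only forward-declared in the header. A minimal standalone sketch (names illustrative):

#include <memory>

// header: only a forward declaration of Impl is visible.
class Widget
{
public:
    Widget();
    ~Widget(); // declared here, defined where Impl is a complete type

private:
    struct Impl;
    std::unique_ptr<Impl> _impl;
};

// source file: Impl is complete, so unique_ptr<Impl>'s deleter can be instantiated.
struct Widget::Impl
{
    int state{ 0 };
};

Widget::Widget() : _impl(std::make_unique<Impl>()) {}
Widget::~Widget() = default;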
void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input,
const ITensor *weights,
const ITensor *bias,
ITensor *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info)
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(depth_multiplier);
@@ -172,10 +186,11 @@ void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor
output->info(),
conv_info,
depth_multiplier,
- act_info));
+ act_info,
+ dilation));
  // Output auto initialization if not yet initialized
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
_input = input;
@@ -185,24 +200,24 @@ void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor
_is_prepared = false;
// Create convolver
- _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
- ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
+ _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation);
+ ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr);
// Create assembly kernel wrapper
- _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
+ _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get());
constexpr size_t alignment = 128;
// Create workspace
const unsigned int num_threads = NEScheduler::get().num_threads();
- const size_t workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
+ const size_t workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads);
ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
_workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
_memory_group.manage(&_workspace);
_workspace.allocator()->allocate();
// Create packing tensor
- const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
+ const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size();
ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
_packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
}
@@ -213,7 +228,8 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
const ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info)
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -227,6 +243,7 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() != dilation.y());
const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
@@ -243,7 +260,7 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
// Check output
if(output->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -283,17 +300,17 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso
bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
// Check for supported padding
- const auto pad_top = conv_info.pad_top();
- const auto pad_right = conv_info.pad_right();
- const auto pad_bottom = conv_info.pad_bottom();
- const auto pad_left = conv_info.pad_left();
- PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
- bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
- bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
- bool supported_padding = is_same_padding || is_valid_padding;
- bool is_dilation_1 = dilation.x() == 1 && dilation.y() == 1;
-
- return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
+ const auto pad_top = conv_info.pad_top();
+ const auto pad_right = conv_info.pad_right();
+ const auto pad_bottom = conv_info.pad_bottom();
+ const auto pad_left = conv_info.pad_left();
+ PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+ bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+ bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+ bool supported_padding = is_same_padding || is_valid_padding;
+ bool is_dilation_supported = (dilation.x() == dilation.y()) || (dilation == Size2D(1U, 1U));
+
+ return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported;
}
void NEDepthwiseConvolutionAssemblyDispatch::run()
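A condensed standalone sketch of the shape-related conditions in the is_optimized_supported() return expression above (data-type support is checked separately, just before this hunk): 3x3 weights, equal strides of 1 or 2, SAME or VALID padding, a depth multiplier of 1, and square dilation.

bool optimized_path_supported(unsigned int kernel_w, unsigned int kernel_h,
                              unsigned int stride_x, unsigned int stride_y,
                              bool same_or_valid_padding, unsigned int depth_multiplier,
                              unsigned int dilation_x, unsigned int dilation_y)
{
    const bool kernel_ok   = (kernel_w == 3) && (kernel_h == 3);
    const bool strides_ok  = (stride_x == stride_y) && ((stride_x == 1) || (stride_x == 2));
    const bool dilation_ok = (dilation_x == dilation_y);
    return kernel_ok && strides_ok && same_or_valid_padding && (depth_multiplier == 1) && dilation_ok;
}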
@@ -305,7 +322,7 @@ void NEDepthwiseConvolutionAssemblyDispatch::run()
// Setup inputs/outputs
ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
- _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
+ _pImpl->_dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
const int input_element_size = _input->info()->element_size();
@@ -313,7 +330,7 @@ void NEDepthwiseConvolutionAssemblyDispatch::run()
const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size;
const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size;
const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes();
- _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+ _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
const int output_element_size = _output->info()->element_size();
@@ -321,10 +338,10 @@ void NEDepthwiseConvolutionAssemblyDispatch::run()
const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size;
const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size;
void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes();
- _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+ _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
// Schedule assembly kernel
- NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX);
}
void NEDepthwiseConvolutionAssemblyDispatch::prepare()
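The run() hunks above hand tensor pointers plus batch/row/column strides to the assembly convolver through the pImpl; those strides are element strides derived from the byte strides reported by the tensor info. A standalone sketch of the conversion (the struct and helper are illustrative):

// Strides passed to the assembly convolver are in elements, so the byte
// strides are divided by the element size before use.
struct ElementStrides
{
    int batch;
    int row;
    int col;
};

ElementStrides to_element_strides(int batch_stride_bytes, int row_stride_bytes,
                                  int col_stride_bytes, int element_size)
{
    return { batch_stride_bytes / element_size,
             row_stride_bytes / element_size,
             col_stride_bytes / element_size };
}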
@@ -338,12 +355,12 @@ void NEDepthwiseConvolutionAssemblyDispatch::prepare()
const int weights_element_size = _weights->info()->element_size();
const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size;
const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size;
- _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
- _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
- weights_row_stride,
- weights_col_stride,
- (_bias != nullptr) ? _bias->buffer() : nullptr);
- _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
+ _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
+ _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
+ weights_row_stride,
+ weights_col_stride,
+ (_bias != nullptr) ? _bias->buffer() : nullptr);
+ _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
_weights->mark_as_unused();
if(_bias != nullptr)
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 4c78eb87ea..440cb88ac2 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -215,6 +215,7 @@ public:
// Stride 2
add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
+ add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL), Size2D(2U, 2U));
}
};
/** Dataset containing optimized, 3x3 depthwise convolution shapes. */
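The new dataset entry exercises dilation on the stride-2 optimized path. As a quick sanity check of the shape it implies (a standalone sketch reusing the effective kernel extent from the earlier validate() checks; floor rounding shown, which coincides with CEIL for these numbers):

#include <cstddef>

// Output extent along one dimension of a dilated convolution.
constexpr std::size_t conv_out_size(std::size_t in, std::size_t k, std::size_t stride,
                                    std::size_t pad_before, std::size_t pad_after, std::size_t dilation)
{
    return (in + pad_before + pad_after - (k + (k - 1) * (dilation - 1))) / stride + 1;
}

// 9x9 input, 3x3 kernel, stride 2, padding 1 on each side, dilation 2 -> 4x4 output.
static_assert(conv_out_size(9, 3, 2, 1, 1, 2) == 4, "the added config produces a 4x4 output");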
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 773ebdeacc..2ffe540fbc 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -156,7 +156,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
 input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, dilation, expected)
{
- bool is_valid = bool(NEDepthwiseConvolutionLayer3x3::validate(&input_info.clone()->set_is_resizable(false),
+ bool is_valid = bool(NEDepthwiseConvolutionLayerOptimized::validate(&input_info.clone()->set_is_resizable(false),
&weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
@@ -296,7 +296,7 @@ TEST_SUITE_END() // Generic
TEST_SUITE(W3x3)
template <typename T>
-using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
+using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerOptimized, T>;
FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
depth_multipliers),
framework::dataset::make("DataType",
@@ -409,7 +409,7 @@ TEST_SUITE_END() // Dilation
TEST_SUITE_END() // Generic
TEST_SUITE(W3x3)
template <typename T>
-using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
+using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerOptimized, T>;
FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
depth_multipliers),
framework::dataset::make("DataType",
@@ -480,7 +480,7 @@ TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
template <typename T>
-using NEDepthwiseConvolutionLayerQuantizedFixture3x3 = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
+using NEDepthwiseConvolutionLayerQuantizedFixture3x3 = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerOptimized, T>;
template <typename T>
using NEDepthwiseConvolutionLayerQuantizedFixture = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;