author    Sang-Hoon Park <sang-hoon.park@arm.com>  2020-10-01 10:13:07 +0100
committer Sang-Hoon Park <sang-hoon.park@arm.com>  2020-10-19 08:08:24 +0000
commit    e4558b501bc4a8e4e731517916a29fb1594d2a78 (patch)
tree      1de4af663025100a09a2f39c801eb233003a4942
parent    671d4f01d96b62a24cf0688059118a1e7908650e (diff)
COMPMID-3163: Remove padding from NEDepthwiseConvolutionLayerNativeKernel
Change-Id: Ibbd6bee5c6a4ce4f212b207d17a65b9c33bcfa78
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4106
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
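Editor's note: the patch drops the kernel's implicit input padding (and the separate NEFillBorderKernel pass) by restructuring each depthwise loop into a vector-sized main loop over the channel axis plus a scalar left-over tail, and by zeroing out-of-bounds taps through an explicit validity check (is_valid_input_region) instead of reading border-filled memory. Below is a minimal standalone sketch of that loop structure, using plain arrays and a single width slice rather than the ACL Window/Iterator machinery; all names and the layout are illustrative only, not the kernel's actual code.

#include <algorithm>
#include <cstdint>

constexpr uint32_t vector_size = 8; // channels handled per step of the main loop

// Depthwise accumulation along the width axis for one output position whose
// left-most tap starts at base_w (possibly negative because of pad_left).
// input is a flattened [input_w][channels] slice, weights [kernel_w][channels].
void accumulate_point(const float *input, const float *weights, float *output,
                      uint32_t channels, int32_t base_w, uint32_t kernel_w,
                      uint32_t dilation_x, uint32_t input_w)
{
    const uint32_t leftover_start = channels - channels % vector_size;

    uint32_t c = 0;
    for(; c < leftover_start; c += vector_size) // main loop: whole vectors
    {
        float acc[vector_size] = {};
        for(uint32_t k = 0; k < kernel_w; ++k)
        {
            const int32_t w     = base_w + static_cast<int32_t>(k * dilation_x);
            const bool    valid = w >= 0 && w < static_cast<int32_t>(input_w);
            for(uint32_t i = 0; i < vector_size; ++i)
            {
                // Out-of-bounds taps contribute zero instead of reading padded memory.
                const float in = valid ? input[w * channels + c + i] : 0.f;
                acc[i] += in * weights[k * channels + c + i];
            }
        }
        std::copy(acc, acc + vector_size, output + c);
    }

    for(; c < channels; ++c) // left-over: scalar tail, so no alignment padding is needed
    {
        float acc = 0.f;
        for(uint32_t k = 0; k < kernel_w; ++k)
        {
            const int32_t w     = base_w + static_cast<int32_t>(k * dilation_x);
            const bool    valid = w >= 0 && w < static_cast<int32_t>(input_w);
            acc += (valid ? input[w * channels + c] : 0.f) * weights[k * channels + c];
        }
        output[c] = acc;
    }
}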
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h |  19
-rw-r--r--  arm_compute/core/Window.h                                               |   4
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h        |   1
-rw-r--r--  src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp       | 597
-rw-r--r--  src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp              |   6
-rw-r--r--  tests/validation/NEON/DepthwiseConvolutionLayer.cpp                     |  16
-rw-r--r--  tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp               |  31
7 files changed, 404 insertions(+), 270 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
index 2e29234b6f..335a70fc2b 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Requires.h"
+#include "arm_compute/core/utils/misc/Traits.h"
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>
@@ -92,18 +92,18 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
private:
- template < typename T, typename TW, int S, typename std::enable_if < std::is_same<T, float>::value
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- || std::is_same<T, float16_t>::value
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ,
- int >::type = 0 >
+ template <typename T>
+ using FloatEnalber = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type;
+
+ template <typename T, typename TW, FloatEnalber<T> = 0>
void run_depthwise(const Window &window, bool has_biases);
- template < typename T, typename TW, int S, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+ template <typename T>
+ using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type;
+
+ template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
void run_depthwise(const Window &window, bool has_biases);
/** Common signature for all the specialised depthwise convolution native functions
@@ -113,7 +113,6 @@ private:
using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases);
DepthwiseFunctionPtr _func;
- BorderSize _border_size;
const ITensor *_input;
const ITensor *_weights;
const ITensor *_biases;
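Editor's note: the header change above replaces the REQUIRES_TA macro with std::enable_if aliases (FloatEnalber / Quantized8bitEnalber, spelling as committed) so the two run_depthwise member templates are selected by element type rather than by an explicit vector-size parameter. A minimal sketch of the same SFINAE dispatch pattern follows, with hypothetical free functions standing in for the member templates and std::is_floating_point standing in for arm_compute::utils::traits::is_floating_point:

#include <cstdint>
#include <type_traits>

// Each alias is a valid type (int) only for its group of element types, so a
// defaulted non-type template parameter selects exactly one overload per T.
template <typename T>
using EnableIfFloat = typename std::enable_if<std::is_floating_point<T>::value, int>::type;

template <typename T>
using EnableIfQuantized8 =
    typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int>::type;

template <typename T, EnableIfFloat<T> = 0>
const char *run_depthwise_kind()
{
    return "float path";
}

template <typename T, EnableIfQuantized8<T> = 0>
const char *run_depthwise_kind()
{
    return "quantized 8-bit path";
}

// run_depthwise_kind<float>()   picks the float overload,
// run_depthwise_kind<uint8_t>() the quantized one; any other T fails to compile.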
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index 2ba5440c68..150320a90e 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,8 @@ public:
static constexpr size_t DimY = 1;
/** Alias for dimension 2 also known as Z dimension */
static constexpr size_t DimZ = 2;
+ /** Alias for dimension 3 also known as W dimension */
+ static constexpr size_t DimW = 3;
/** Default constructor: create a window containing a single element. */
constexpr Window()
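Editor's note: the new Window::DimW alias is used by the kernel below to mark the weights' fourth dimension as manually handled. A small usage sketch, assuming only the ACL Window API included here; a dimension collapsed to Dimension(0, 0, 0) (the kernel's dim_manual_loop) is left to the kernel body to walk itself:

#include "arm_compute/core/Window.h"

using arm_compute::Window;

// Name the fourth dimension instead of using the bare index 3, and flag it
// as a manual loop so execute_window_loop does not advance it.
void mark_w_dimension_as_manual(Window &win_weights)
{
    win_weights.set(Window::DimW, Window::Dimension(0, 0, 0));
}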
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 3b75bb11a7..c6b98ed435 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -249,7 +249,6 @@ private:
private:
NEDepthwiseConvolutionLayerNativeKernel _depthwise_conv_kernel;
- NEFillBorderKernel _fill_border;
NEPermute _permute_input;
NEPermute _permute_weights;
NEPermute _permute_output;
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index fb47879b17..0a34ee6a07 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -36,46 +36,103 @@ namespace arm_compute
{
namespace
{
-void pad_vectors(std::vector<int> &mult, std::vector<int> &shift, int vec_size)
+constexpr auto data_layout = DataLayout::NHWC;
+const size_t batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
+constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
+constexpr size_t vector_size = 8;
+
+struct DepthwiseConvolutionRunInfo
{
- ARM_COMPUTE_ERROR_ON(mult.size() != shift.size());
- while(mult.size() % vec_size != 0)
+public:
+ const size_t num_read_elements_per_iteration;
+ const uint32_t x_start;
+ const uint32_t x_end;
+ const uint32_t x_step;
+ const uint32_t x_leftover_start;
+ const size_t input_stride_y;
+ const size_t input_stride_z;
+ const size_t input_max_offset;
+ const size_t weights_width;
+ const size_t weights_height;
+ const size_t weights_stride_y;
+ const size_t weights_stride_z;
+ const size_t conv_stride_x;
+ const size_t conv_stride_y;
+ const size_t conv_pad_left;
+ const size_t conv_pad_top;
+ const size_t input_height;
+ const size_t input_width;
+ const size_t input_depth;
+
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
+ : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
+ x_start(w.x().start()),
+ x_end(w.x().end()),
+ x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
+ x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
+ input_stride_y(input.strides_in_bytes().y()),
+ input_stride_z(input.strides_in_bytes().z()),
+ input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
+ weights_width(weights.dimension(width_idx)),
+ weights_height(weights.dimension(height_idx)),
+ weights_stride_y(weights.strides_in_bytes().y()),
+ weights_stride_z(weights.strides_in_bytes().z()),
+ conv_stride_x(conv_info.stride().first),
+ conv_stride_y(conv_info.stride().second),
+ conv_pad_left(conv_info.pad_left()),
+ conv_pad_top(conv_info.pad_top()),
+ input_height(input.dimension(height_idx)),
+ input_width(input.dimension(width_idx)),
+ input_depth(input.dimension(channel_idx))
{
- mult.push_back(0);
- shift.push_back(0);
}
+};
+
+inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
+{
+ const int32_t current_h = base_h + h * dilation.y();
+ const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
+
+ const int32_t current_w = base_w + w * dilation.x();
+ const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
+
+ return is_valid_h && is_valid_w;
}
-template <typename T, int S>
+template <typename T>
void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
const Size2D &dilation, const Window &window, bool has_biases)
{
- using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
- using TagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- const size_t input_stride_y = input->info()->strides_in_bytes().y();
- const size_t input_stride_z = input->info()->strides_in_bytes().z();
- const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
- input->info()->strides_in_bytes().y();
- const size_t weights_width = weights->info()->dimension(1);
- const size_t weights_height = weights->info()->dimension(2);
- const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
- const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
- const size_t conv_stride_x = conv_info.stride().first;
- const size_t conv_stride_y = conv_info.stride().second;
- const size_t conv_pad_left = conv_info.pad_left();
- const size_t conv_pad_top = conv_info.pad_top();
+ constexpr auto element_per_vector = vector_size / sizeof(T);
+ using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
+ using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
+
+ const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+
+ const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, dim_single_unit_step);
Window win_input = window;
- win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ win_input.set(Window::DimX, dim_manual_loop);
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
Window win_weights = win_input;
- win_weights.set(3, Window::Dimension(0, 0, 0));
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set(Window::DimX, dim_manual_loop);
Iterator input_it(input, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, window);
+ Iterator output_it(output, win_output);
Iterator biases_it{};
if(has_biases)
@@ -83,38 +140,80 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights,
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(execution_window, [&](const Coordinates & id)
{
- VectorType acc = wrapper::vdup_n(static_cast<T>(0), TagType{});
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- const int input_y = id.y() * conv_stride_x - conv_pad_left;
- const int input_z = id.z() * conv_stride_y - conv_pad_top;
- int input_offset = input_y * input_stride_y + input_z * input_stride_z;
+ auto const base_weights_ptr = weights_it.ptr();
+ uint32_t x = run_info.x_start;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < weights_height; ++h)
+ for(; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int offs = input_offset;
- for(size_t w = 0; w < weights_width; ++w)
+ VectorType acc = zero_vector;
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for(uint32_t h = 0; h < run_info.weights_height; ++h)
{
- const auto input_vals = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
- const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * weights_stride_y));
+ int64_t offs = input_offset + x * sizeof(T);
+ for(uint32_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals = is_valid_region ?
+ wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
+ zero_vector;
+ const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ acc = wrapper::vmla(acc, weights_vals, input_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- acc = wrapper::vmla(acc, weights_vals, input_vals);
- offs += dilation.x() * input_stride_y;
+ if(has_biases)
+ {
+ const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc = wrapper::vadd(acc, biases_vals);
}
- weights_ptr += weights_stride_z;
- input_offset += dilation.y() * input_stride_z;
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
}
- if(has_biases)
+ for(; x < run_info.x_end; ++x)
{
- const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()));
- acc = wrapper::vadd(acc, biases_vals);
- }
+ auto acc_scalar = T{ 0 };
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for(size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for(size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
+ const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc_scalar += (input_vals * weights_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), acc);
+ if(has_biases)
+ {
+ const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc_scalar += biases_vals;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
+ }
},
input_it, weights_it, biases_it, output_it);
}
@@ -123,31 +222,28 @@ template <typename T>
void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
- const size_t input_stride_y = input->info()->strides_in_bytes().y();
- const size_t input_stride_z = input->info()->strides_in_bytes().z();
- const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
- input->info()->strides_in_bytes().y();
- const size_t weights_width = weights->info()->dimension(1);
- const size_t weights_height = weights->info()->dimension(2);
- const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
- const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
- const size_t conv_stride_x = conv_info.stride().first;
- const size_t conv_stride_y = conv_info.stride().second;
- const size_t conv_pad_left = conv_info.pad_left();
- const size_t conv_pad_top = conv_info.pad_top();
+ const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
- Window win_input = window;
- win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Window execution_window = window;
+ execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
- Window win_weights = win_input;
- win_weights.set(3, Window::Dimension(0, 0, 0));
+ Window win_input = execution_window;
+ win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
- win_input.set_dimension_step(Window::DimX, 1);
+ Window win_weights = window;
+ win_weights.set_dimension_step(Window::DimX, run_info.x_step);
+ win_weights.set(Window::DimY, dim_manual_loop);
+ win_weights.set(Window::DimZ, dim_manual_loop);
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set_dimension_step(Window::DimX, run_info.x_step);
Iterator input_it(input, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, window);
+ Iterator output_it(output, win_output);
Iterator biases_it{};
if(has_biases)
@@ -155,33 +251,34 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(execution_window, [&](const Coordinates & id)
{
std::vector<T> acc(depth_multiplier, static_cast<T>(0));
- const int input_y = id.y() * conv_stride_x - conv_pad_left;
- const int input_z = id.z() * conv_stride_y - conv_pad_top;
- int input_offset = input_y * input_stride_y + input_z * input_stride_z;
+ const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < weights_height; ++h)
+ for(size_t h = 0; h < run_info.weights_height; ++h)
{
int offs = input_offset;
- for(size_t w = 0; w < weights_width; ++w)
+ for(size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
for(size_t m = 0; m < depth_multiplier; ++m)
{
- const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
+ const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
}
- offs += dilation.x() * input_stride_y;
+ offs += dilation.x() * run_info.input_stride_y;
}
- weights_ptr += weights_stride_z;
- input_offset += dilation.y() * input_stride_z;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
if(has_biases)
@@ -203,41 +300,43 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con
input_it, weights_it, biases_it, output_it);
}
-template <typename T, typename TW, int S>
+template <typename T, typename TW>
void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
- using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
- using TagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- const size_t input_stride_y = input->info()->strides_in_bytes().y();
- const size_t input_stride_z = input->info()->strides_in_bytes().z();
- const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
- input->info()->strides_in_bytes().y();
- const size_t weights_width = weights->info()->dimension(1);
- const size_t weights_height = weights->info()->dimension(2);
- const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
- const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
- const size_t conv_stride_x = conv_info.stride().first;
- const size_t conv_stride_y = conv_info.stride().second;
- const size_t conv_pad_left = conv_info.pad_left();
- const size_t conv_pad_top = conv_info.pad_top();
+ constexpr auto element_per_vector = vector_size / sizeof(T);
+ using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
+ using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
+ using AccType = int32_t;
+ using AccArrayType = std::array<AccType, element_per_vector>;
+
+ const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+ const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
+
+ const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
- const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
+ const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
+
+ Window execution_window = window;
+ execution_window.set(Window::DimX, dim_single_unit_step);
Window win_input = window;
- win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ win_input.set(Window::DimX, dim_manual_loop);
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
Window win_weights = win_input;
- win_weights.set(3, Window::Dimension(0, 0, 0));
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set(Window::DimX, dim_manual_loop);
Iterator input_it(input, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, window);
+ Iterator output_it(output, win_output);
Iterator biases_it{};
if(has_biases)
@@ -245,65 +344,134 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(execution_window, [&](const Coordinates & id)
{
- std::vector<int32_t> acc(S, 0);
- std::vector<int32_t> in_sum(S, 0);
- std::vector<int32_t> we_sum(S, 0);
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ auto const base_weights_ptr = weights_it.ptr();
+ size_t x = run_info.x_start;
- const int input_y = id.y() * conv_stride_x - conv_pad_left;
- const int input_z = id.z() * conv_stride_y - conv_pad_top;
- int input_offset = input_y * input_stride_y + input_z * input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < weights_height; ++h)
+ for(; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int offs = input_offset;
- for(size_t w = 0; w < weights_width; ++w)
+ AccArrayType acc{};
+ AccArrayType in_sum{};
+ AccArrayType we_sum{};
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
+
+ for(size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for(size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals = is_valid_region ?
+ wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
+ out_of_bound_vector;
+ const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ for(size_t i = 0; i < run_info.x_step; ++i)
+ {
+ acc.at(i) += input_vals[i] * weights_vals[i];
+ in_sum.at(i) += input_vals[i];
+ we_sum.at(i) += weights_vals[i];
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
+ for(size_t i = 0; i < run_info.x_step; ++i)
{
- const auto input_vals = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
- const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * weights_stride_y));
+ acc.at(i) -= in_sum.at(i) * weights_qoffset;
+ acc.at(i) -= we_sum.at(i) * input_qoffset;
+ acc.at(i) += k_offset;
- for(int i = 0; i < S; ++i)
+ if(has_biases)
{
- acc.at(i) += input_vals[i] * weights_vals[i];
- in_sum.at(i) += input_vals[i];
- we_sum.at(i) += weights_vals[i];
+ acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
}
- offs += dilation.x() * input_stride_y;
+ const int32_t out_mul = output_multiplier.at(x + i);
+ const int32_t out_shift = output_shift.at(x + i);
+ if(out_shift < 0)
+ {
+ acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
+ }
+ out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
}
- weights_ptr += weights_stride_z;
- input_offset += dilation.y() * input_stride_z;
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
}
- VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
- for(int i = 0; i < S; ++i)
+ // left-over
+ for(; x < run_info.x_end; ++x)
{
- acc.at(i) -= in_sum.at(i) * weights_qoffset;
- acc.at(i) -= we_sum.at(i) * input_qoffset;
- acc.at(i) += k_offset;
+ AccType acc = 0;
+ AccType in_sum = 0;
+ AccType we_sum = 0;
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
+
+ for(size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for(size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val = is_valid_region ?
+ *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
+ out_of_bound_value;
+ const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc += input_val * weights_val;
+ in_sum += input_val;
+ we_sum += weights_val;
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
+
+ T out_vals{ 0 };
+
+ acc -= in_sum * weights_qoffset;
+ acc -= we_sum * input_qoffset;
+ acc += k_offset;
if(has_biases)
{
- acc.at(i) += *reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t));
+ acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
}
- const int out_mul = output_multiplier.at(id.x() + i);
- const int out_shift = output_shift.at(id.x() + i);
+ const int32_t out_mul = output_multiplier.at(x);
+ const int32_t out_shift = output_shift.at(x);
+
if(out_shift < 0)
{
- acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
}
else
{
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
+ acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
}
- out_vals[i] = static_cast<T>(utility::clamp<int32_t, T>(acc.at(i)));
- }
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), out_vals);
+ out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
+ }
},
input_it, weights_it, biases_it, output_it);
}
@@ -312,36 +480,36 @@ template <typename T, typename TW>
void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
- const size_t input_stride_y = input->info()->strides_in_bytes().y();
- const size_t input_stride_z = input->info()->strides_in_bytes().z();
- const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
- input->info()->strides_in_bytes().y();
- const size_t weights_width = weights->info()->dimension(1);
- const size_t weights_height = weights->info()->dimension(2);
- const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
- const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
- const size_t conv_stride_x = conv_info.stride().first;
- const size_t conv_stride_y = conv_info.stride().second;
- const size_t conv_pad_left = conv_info.pad_left();
- const size_t conv_pad_top = conv_info.pad_top();
+ using AccType = int32_t;
+
+ const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+
+ const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
- const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
+ const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
- Window win_input = window;
- win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Window execution_window = window;
+ execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
- Window win_weights = win_input;
- win_weights.set(3, Window::Dimension(0, 0, 0));
+ Window win_input = execution_window;
+ win_input.set(Window::DimY, dim_manual_loop);
+ win_input.set(Window::DimZ, dim_manual_loop);
- win_input.set_dimension_step(Window::DimX, 1);
+ Window win_weights = window;
+ win_weights.set_dimension_step(Window::DimX, run_info.x_step);
+ win_weights.set(Window::DimY, dim_manual_loop);
+ win_weights.set(Window::DimZ, dim_manual_loop);
+ win_weights.set(Window::DimW, dim_manual_loop);
+
+ Window win_output = window;
+ win_output.set_dimension_step(Window::DimX, run_info.x_step);
Iterator input_it(input, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, window);
+ Iterator output_it(output, win_output);
Iterator biases_it{};
if(has_biases)
@@ -349,38 +517,39 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(execution_window, [&](const Coordinates & id)
{
- std::vector<int32_t> acc(depth_multiplier, 0);
- std::vector<int32_t> we_sum(depth_multiplier, 0);
- int32_t in_sum = 0;
+ std::vector<AccType> acc(depth_multiplier, 0);
+ std::vector<AccType> we_sum(depth_multiplier, 0);
+ AccType in_sum = 0;
- const int input_y = id.y() * conv_stride_x - conv_pad_left;
- const int input_z = id.z() * conv_stride_y - conv_pad_top;
- int input_offset = input_y * input_stride_y + input_z * input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < weights_height; ++h)
+ for(size_t h = 0; h < run_info.weights_height; ++h)
{
int offs = input_offset;
- for(size_t w = 0; w < weights_width; ++w)
+ for(size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
for(size_t m = 0; m < depth_multiplier; ++m)
{
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
+ const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
acc.at(m) += input_val * weights_val;
we_sum.at(m) += weights_val;
}
- offs += dilation.x() * input_stride_y;
+ offs += dilation.x() * run_info.input_stride_y;
in_sum += input_val;
}
- weights_ptr += weights_stride_z;
- input_offset += dilation.y() * input_stride_z;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
for(size_t m = 0; m < depth_multiplier; ++m)
@@ -394,8 +563,8 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
}
- const int out_mul = output_multiplier.at(id.x() + m);
- const int out_shift = output_shift.at(id.x() + m);
+ const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
+ const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
if(out_shift < 0)
{
acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
@@ -404,7 +573,7 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
{
acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
}
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, T>(acc.at(m)));
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
}
},
input_it, weights_it, biases_it, output_it);
@@ -458,54 +627,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases,
- ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const Size2D &dilation)
-{
- // Get convolved dimensions
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
- // Configure kernel window (generic)
- const unsigned int num_elems_read_per_iteration = (depth_multiplier == 1) ? 8 / element_size_from_data_type(input->data_type()) : 1;
- const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier;
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-
- AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)),
- input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()));
- AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-
- if(biases != nullptr)
- {
- AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration);
- window_changed |= update_window_and_padding(win, biases_access);
- }
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
- : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
+ : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
{
}
-BorderSize NEDepthwiseConvolutionLayerNativeKernel::border_size() const
-{
- return _border_size;
-}
-
void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
@@ -518,7 +646,6 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co
_output = output;
_conv_info = conv_info;
_depth_multiplier = depth_multiplier;
- _border_size = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
_dilation = dilation;
_has_biases = (biases != nullptr);
@@ -530,17 +657,17 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co
auto weights_scale = weights->info()->quantization_info().scale();
if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))
{
- for(size_t i = 1; i < _weights->info()->dimension(0); ++i)
+ for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i)
{
weights_scale.push_back(weights_scale.front());
}
}
- for(size_t i = 0; i < weights_scale.size(); ++i)
+ for(const auto &s : weights_scale)
{
int32_t out_mult = 0;
int32_t out_shift = 0;
- const float multiplier = input_scale * weights_scale.at(i) / output_scale;
+ const float multiplier = input_scale * s / output_scale;
arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
_output_multiplier.push_back(out_mult);
@@ -551,42 +678,42 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co
switch(_weights->info()->data_type())
{
case DataType::QASYMM8:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8>;
- pad_vectors(_output_multiplier, _output_shift, 8);
+ _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;
break;
case DataType::QASYMM8_SIGNED:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8>;
- pad_vectors(_output_multiplier, _output_shift, 8);
+ _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
break;
case DataType::QSYMM8_PER_CHANNEL:
if(_input->info()->data_type() == DataType::QASYMM8)
{
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8>;
+ _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;
}
else
{
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8>;
+ _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
}
- pad_vectors(_output_multiplier, _output_shift, 8);
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4>;
- pad_vectors(_output_multiplier, _output_shift, 4);
+ _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2>;
- pad_vectors(_output_multiplier, _output_shift, 2);
+ _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
}
- auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+
+ Window win = calculate_max_window(*output->info(), Steps());
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+ INEKernel::configure(win);
}
Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -594,9 +721,6 @@ Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *inpu
const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? biases->clone().get() : nullptr, output->clone().get(), conv_info,
- depth_multiplier, dilation)
- .first);
return Status{};
}
@@ -609,12 +733,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const Th
(this->*_func)(window, _has_biases);
}
-template < typename T, typename TW, int S, typename std::enable_if < std::is_same<T, float>::value
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- || std::is_same<T, float16_t>::value
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ,
- int >::type >
+template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -622,7 +741,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window
if(_depth_multiplier == 1)
{
- depthwise_loop_multiplier1_fp<T, S>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
+ depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
}
else
{
@@ -630,7 +749,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window
}
}
-template <typename T, typename TW, int S, typename>
+template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -638,7 +757,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window
if(_depth_multiplier == 1)
{
- depthwise_loop_multiplier1_quantized<T, TW, S>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
+ depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
}
else
{
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 915a2830bf..6c22523bcb 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -210,8 +210,8 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
}
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
- : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(),
- _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+ : _depthwise_conv_kernel(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
+ _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
{
}
@@ -244,7 +244,6 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
_original_weights = weights_to_use;
_depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
- _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()));
if(_is_nchw)
{
@@ -310,7 +309,6 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
_permute_input.run();
}
- NEScheduler::get().schedule(&_fill_border, Window::DimX);
NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
if(_is_nchw)
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 407ebe362a..e255fc7b4d 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -177,8 +177,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size
TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Patch size bigger than input width
TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Dilation < 1
- TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Window shrinking
- TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8), // Window shrinking
}),
framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
@@ -188,8 +186,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
- TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
- TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8),
})),
framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
TensorInfo(TensorShape(2U), 1, DataType::F32),
@@ -199,8 +195,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
TensorInfo(TensorShape(2U), 1, DataType::F32),
TensorInfo(TensorShape(16U), 1, DataType::F32),
TensorInfo(TensorShape(16U), 1, DataType::F32),
- TensorInfo(TensorShape(16U), 1, DataType::F32),
- TensorInfo(TensorShape(24U), 1, DataType::S32),
})),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
@@ -210,8 +204,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
- TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8),
})),
framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
PadStrideInfo(1, 1, 0, 0),
@@ -221,8 +213,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
PadStrideInfo(1, 1, 0, 0),
PadStrideInfo(1, 1, 0, 0),
PadStrideInfo(1, 1, 0, 0),
- PadStrideInfo(1, 1, 0, 0),
- PadStrideInfo(1, 1, 1, 0),
})),
framework::dataset::make("DepthMultiplier", { 1,
1,
@@ -232,8 +222,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
1,
2,
2,
- 2,
- 3,
})),
framework::dataset::make("Dilation", { Size2D(1U, 1U),
Size2D(1U, 1U),
@@ -243,10 +231,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
Size2D(1U, 1U),
Size2D(25U, 1U),
Size2D(0U, 1U),
- Size2D(1U, 1U),
- Size2D(1U, 1U),
})),
- framework::dataset::make("Expected", { false, false, false, false, false, false,false, false, false, false })),
+ framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})),
input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
{
bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index be0eee32e0..47551355bb 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -102,6 +102,37 @@ const auto data_layout_values = framework::dataset::make("data_layout", { DataLa
TEST_SUITE(NEON)
TEST_SUITE(DepthwiseConvolutionLayerNative)
+
+TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
+{
+ // this test case will ensure that the kernel is not adding implicit padding
+ constexpr uint32_t vector_size = 8; // Assumed vector size of the current native kernel
+ constexpr auto depth = vector_size * 2 + 1; // misaligned depth to force padding if it exists.
+ constexpr auto data_layout = DataLayout::NHWC;
+ constexpr auto data_type = DataType::F32;
+
+ const auto input_size = Size2D{ 100, 100 }; // random plane size of the input
+ const auto kernel_size = Size2D{ 4, 4 }; // random plane size of the kernel
+ const auto pad_stride_info = PadStrideInfo(3, 3); // random convolution information
+
+ TensorShape src_shape{ depth, input_size.x(), input_size.y() };
+ TensorShape weights_shape{ depth, kernel_size.x(), kernel_size.y() };
+ TensorShape bias_shape{ depth };
+
+ auto src = create_tensor<Tensor>(src_shape, data_type, 1, QuantizationInfo(), data_layout);
+ auto weights = create_tensor<Tensor>(weights_shape, data_type, 1, QuantizationInfo(), data_layout);
+ auto biases = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout);
+ auto dst = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout);
+
+ NEDepthwiseConvolutionLayerNativeKernel dwc;
+ dwc.configure(&src, &weights, &biases, &dst, pad_stride_info);
+
+ ARM_COMPUTE_EXPECT(src.info()->padding().empty(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(weights.info()->padding().empty(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(biases.info()->padding().empty(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->padding().empty(), framework::LogLevel::ERRORS);
+}
+
TEST_SUITE(Float)
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,