aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h')
-rw-r--r-- src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h 279
1 files changed, 166 insertions, 113 deletions
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
index 8410cdbf16..3fa5c58c3c 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
@@ -24,6 +24,7 @@
#ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
#define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -63,15 +64,21 @@ struct DepthwiseConvolutionRunInfo
const size_t input_width;
const size_t input_depth;
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
- : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ const PadStrideInfo &conv_info,
+ const Window &w,
+ uint32_t depth_multiplier = 1) // NOLINT
+ : num_read_elements_per_iteration(
+ (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
x_start(w.x().start()),
x_end(w.x().end()),
x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
input_stride_y(input.strides_in_bytes().y()),
input_stride_z(input.strides_in_bytes().z()),
- input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
+ input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) -
+ (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
weights_width(weights.dimension(width_idx)),
weights_height(weights.dimension(height_idx)),
weights_stride_y(weights.strides_in_bytes().y()),
@@ -87,7 +94,12 @@ struct DepthwiseConvolutionRunInfo
}
};
-inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
+inline bool is_valid_input_region(int32_t base_w,
+ uint32_t base_h,
+ uint32_t w,
+ uint32_t h,
+ const DepthwiseConvolutionRunInfo &run_info,
+ const Size2D &dilation)
{
const int32_t current_h = base_h + h * dilation.y();
const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
@@ -99,8 +111,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u
}
template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const Window &window,
+ bool has_biases)
{
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
@@ -129,94 +147,112 @@ void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, c
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto const base_weights_ptr = weights_it.ptr();
- uint32_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
{
- VectorType acc = zero_vector;
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto const base_weights_ptr = weights_it.ptr();
+ uint32_t x = run_info.x_start;
- for(uint32_t h = 0; h < run_info.weights_height; ++h)
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int64_t offs = input_offset + x * sizeof(T);
- for(uint32_t w = 0; w < run_info.weights_width; ++w)
+ VectorType acc = zero_vector;
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for (uint32_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- zero_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
- acc = wrapper::vmla(acc, weights_vals, input_vals);
+ int64_t offs = input_offset + x * sizeof(T);
+ for (uint32_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : zero_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ acc = wrapper::vmla(acc, weights_vals, input_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- offs += dilation.x() * run_info.input_stride_y;
+ if (has_biases)
+ {
+ const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc = wrapper::vadd(acc, biases_vals);
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
}
- if(has_biases)
+ for (; x < run_info.x_end; ++x)
{
- const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc = wrapper::vadd(acc, biases_vals);
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
- }
+ auto acc_scalar = T{0};
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
- for(; x < run_info.x_end; ++x)
- {
- auto acc_scalar = T{ 0 };
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
- const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc_scalar += (input_vals * weights_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : 0;
+ const auto weights_vals =
+ *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc_scalar += (input_vals * weights_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc_scalar += biases_vals;
+ if (has_biases)
+ {
+ const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc_scalar += biases_vals;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
}
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T>
-void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
+void depthwise_loop_generic_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ const Window &window,
+ bool has_biases)
{
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Window execution_window = window;
execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -240,81 +276,98 @@ void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<T> acc(depth_multiplier, static_cast<T>(0));
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<T> acc(depth_multiplier, static_cast<T>(0));
- const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
-
- for(size_t m = 0; m < depth_multiplier; ++m)
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : T(0);
+
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
}
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
+ if (has_biases)
{
- const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ }
}
- }
- else
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
+ else
{
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ }
}
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_float(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
PadStrideInfo conv_info = info.pad_stride_info;
unsigned int depth_multiplier = info.depth_multiplier;
Size2D dilation = info.dilation;
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
}
else
{
- depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
+ depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window,
+ has_biases);
}
}
template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute