aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp')
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp592
1 files changed, 344 insertions, 248 deletions
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
index a2ae5564e6..d08e973968 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -65,8 +67,16 @@ inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
namespace
{
template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_multiplier1_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
constexpr auto element_per_vector = vector_size / sizeof(T);
@@ -75,7 +85,8 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei
using AccType = int32_t;
using AccArrayType = std::array<AccType, element_per_vector>;
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
@@ -104,152 +115,175 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto const base_weights_ptr = weights_it.ptr();
- size_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
{
- AccArrayType acc{};
- AccArrayType in_sum{};
- AccArrayType we_sum{};
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ auto const base_weights_ptr = weights_it.ptr();
+ size_t x = run_info.x_start;
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- out_of_bound_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ AccArrayType acc{};
+ AccArrayType in_sum{};
+ AccArrayType we_sum{};
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
- for(size_t i = 0; i < element_per_vector; ++i)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- acc.at(i) += input_vals[i] * weights_vals[i];
- in_sum.at(i) += input_vals[i];
- we_sum.at(i) += weights_vals[i];
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : out_of_bound_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) += input_vals[i] * weights_vals[i];
+ in_sum.at(i) += input_vals[i];
+ we_sum.at(i) += weights_vals[i];
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
}
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) -= in_sum.at(i) * weights_qoffset;
+ acc.at(i) -= we_sum.at(i) * input_qoffset;
+ acc.at(i) += k_offset;
- VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) -= in_sum.at(i) * weights_qoffset;
- acc.at(i) -= we_sum.at(i) * input_qoffset;
- acc.at(i) += k_offset;
+ if (has_biases)
+ {
+ acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ }
- if(has_biases)
- {
- acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ const int32_t out_mul = output_multiplier.at(x + i);
+ const int32_t out_shift = output_shift.at(x + i);
+ if (out_shift < 0)
+ {
+ acc.at(i) =
+ saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(i) =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) +
+ output_qoffset;
+ }
+ out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
}
- const int32_t out_mul = output_multiplier.at(x + i);
- const int32_t out_shift = output_shift.at(x + i);
- if(out_shift < 0)
- {
- acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
- }
- out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
}
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
- }
-
- // left-over
- for(; x < run_info.x_end; ++x)
- {
- AccType acc = 0;
- AccType in_sum = 0;
- AccType we_sum = 0;
+ // left-over
+ for (; x < run_info.x_end; ++x)
+ {
+ AccType acc = 0;
+ AccType in_sum = 0;
+ AccType we_sum = 0;
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ?
- *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
- out_of_bound_value;
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc += input_val * weights_val;
- in_sum += input_val;
- we_sum += weights_val;
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : out_of_bound_value;
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc += input_val * weights_val;
+ in_sum += input_val;
+ we_sum += weights_val;
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ T out_vals{0};
- T out_vals{ 0 };
+ acc -= in_sum * weights_qoffset;
+ acc -= we_sum * input_qoffset;
+ acc += k_offset;
- acc -= in_sum * weights_qoffset;
- acc -= we_sum * input_qoffset;
- acc += k_offset;
+ if (has_biases)
+ {
+ acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
+ }
- if(has_biases)
- {
- acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
- }
+ const int32_t out_mul = output_multiplier.at(x);
+ const int32_t out_shift = output_shift.at(x);
- const int32_t out_mul = output_multiplier.at(x);
- const int32_t out_shift = output_shift.at(x);
+ if (out_shift < 0)
+ {
+ acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ }
- if(out_shift < 0)
- {
- acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
}
-
- out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_generic_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
using AccType = int32_t;
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
@@ -277,76 +311,93 @@ void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<AccType> acc(depth_multiplier, 0);
- std::vector<AccType> we_sum(depth_multiplier, 0);
- AccType in_sum = 0;
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<AccType> acc(depth_multiplier, 0);
+ std::vector<AccType> we_sum(depth_multiplier, 0);
+ AccType in_sum = 0;
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
-
- for(size_t m = 0; m < depth_multiplier; ++m)
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) += input_val * weights_val;
-
- we_sum.at(m) += weights_val;
- }
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : out_of_bound_value;
- offs += dilation.x() * run_info.input_stride_y;
- in_sum += input_val;
- }
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) += input_val * weights_val;
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ we_sum.at(m) += weights_val;
+ }
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- acc.at(m) -= in_sum * weights_qoffset;
- acc.at(m) -= we_sum.at(m) * input_qoffset;
- acc.at(m) += k_offset;
+ offs += dilation.x() * run_info.input_stride_y;
+ in_sum += input_val;
+ }
- if(has_biases)
- {
- acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
- const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
- if(out_shift < 0)
- {
- acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
+ for (size_t m = 0; m < depth_multiplier; ++m)
{
- acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
+ acc.at(m) -= in_sum * weights_qoffset;
+ acc.at(m) -= we_sum.at(m) * input_qoffset;
+ acc.at(m) += k_offset;
+
+ if (has_biases)
+ {
+ acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ }
+
+ const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
+ const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
+ if (out_shift < 0)
+ {
+ acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) +
+ output_qoffset;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) =
+ static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
}
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
constexpr int half_vec = vector_size / 2;
@@ -355,11 +406,15 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
- const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+ const auto input_qoffset_vec = wrapper::vreinterpret(
+ wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
+ const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(
+ wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
+ const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset,
+ arm_compute::wrapper::traits::vector_128_tag{});
const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -389,7 +444,7 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
@@ -397,95 +452,117 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::fill(begin(acc0), end(acc0), zero);
- std::fill(begin(acc1), end(acc1), zero);
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::fill(begin(acc0), end(acc0), zero);
+ std::fill(begin(acc1), end(acc1), zero);
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- const int32_t current_h = input_z + h * dilation.y();
- if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ const int32_t current_h = input_z + h * dilation.y();
+ if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
{
- const int32_t current_w = input_y + w * dilation.x();
- if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
- const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
- const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ const int32_t current_w = input_y + w * dilation.x();
+ if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
{
- const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
- const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
-
- acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
- acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
+ const auto input_8x8 = wrapper::vdup_n(
+ *(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))),
+ TagType{});
+ const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
+ const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
+
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ {
+ const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(
+ weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
+ const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
+
+ acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs),
+ wrapper::vgetlow(weights_no_offs));
+ acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs),
+ wrapper::vgethigh(weights_no_offs));
+ }
}
- }
- offs += dilation.x() * run_info.input_stride_y;
+ offs += dilation.x() * run_info.input_stride_y;
+ }
}
- }
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
- if(has_biases)
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
{
- const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
+ if (has_biases)
+ {
+ const auto bias_val0 =
+ wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ const auto bias_val1 = wrapper::vloadq(
+ reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
- acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
- acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
- }
+ acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
+ acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
+ }
- if(out_shift < 0)
- {
- acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- }
- else
- {
- acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
- }
+ if (out_shift < 0)
+ {
+ acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ }
+ else
+ {
+ acc0.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ }
- acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
- acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
+ acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
+ acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
- const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
- wrapper::vmovn(acc1.at(i)));
+ const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i)));
- if(std::is_same<T, uint8_t>::value)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
- }
- else
- {
- wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
+ if (std::is_same<T, uint8_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)),
+ wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)),
+ wrapper::vqmovn(out_val));
+ }
}
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
} // namespace
template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
PadStrideInfo conv_info = info.pad_stride_info;
unsigned int depth_multiplier = info.depth_multiplier;
@@ -497,15 +574,15 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co
const auto output_scale = dst->info()->quantization_info().uniform().scale;
auto weights_scale = weights->info()->quantization_info().scale();
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ if (!is_data_type_quantized_per_channel(weights->info()->data_type()))
{
- for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
+ for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
{
weights_scale.push_back(weights_scale.front());
}
}
- for(const auto &s : weights_scale)
+ for (const auto &s : weights_scale)
{
int32_t out_mult = 0;
int32_t out_shift = 0;
@@ -516,30 +593,49 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co
output_shift.push_back(out_shift);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier,
+ output_shift, window, has_biases);
}
else
{
const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
- if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
+ if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
{
- depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation,
+ depth_multiplier, output_multiplier, output_shift, window,
+ has_biases);
}
else
{
- depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier,
+ output_multiplier, output_shift, window, has_biases);
}
}
}
-template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute