aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-09-12 20:11:34 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commita799ce0ad775829862891dd98d1232638ec8761e (patch)
tree4b7bb9b080a44aa5cfff67b2ce7177929b22405f /src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
parentd63dfa2fc61a33b4e675ec6bc7458d8700174134 (diff)
downloadComputeLibrary-a799ce0ad775829862891dd98d1232638ec8761e.tar.gz
COMPMID-1564: Add NEDepthwiseConvolution3x3 for QASYMM8
Change-Id: I1f55508af6f220e5f41df7b56daffb4761ed0591 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148253 Tested-by: bsgcomp <bsgcomp@arm.com> Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp104
1 files changed, 94 insertions, 10 deletions
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 864c63f731..a571d54501 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -194,8 +194,8 @@ inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -304,14 +304,14 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
internal_vst1q(out_ptr, internal_vld1q(in_ptr));
}
},
- in, bi);
+ in, bi, out);
}
}
// QASYMM8 specializations
template <>
-void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
uint8x16_t min = vdupq_n_u8(0);
@@ -352,8 +352,8 @@ void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *
in, out);
}
template <>
-void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_UNUSED(bias);
@@ -382,6 +382,85 @@ void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor
},
in, out);
}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+
+ // Accumulate bias
+ int32x4x4_t v_in =
+ {
+ {
+ vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)),
+ vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)),
+ vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)),
+ vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12))
+ }
+ };
+
+ const auto out_ptr = out.ptr();
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, bi, out);
+}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(bias);
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+
+ // Accumulate bias
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ const auto out_ptr = out.ptr();
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, bi, out);
+}
} // namespace
NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
@@ -426,19 +505,19 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
{
case DataType::S32:
{
- _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ _func = (bias == nullptr) ? &output_stage_nchw<int32_t, uint8_t, false, false> : &output_stage_nchw<int32_t, uint8_t, false, true>;
break;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+ _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
break;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
- _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+ _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
break;
}
default:
@@ -451,6 +530,11 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
{
switch(input->info()->data_type())
{
+ case DataType::S32:
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+ break;
+ }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{