author     Michalis Spyrou <michalis.spyrou@arm.com>   2020-06-22 17:05:43 +0100
committer  Michalis Spyrou <michalis.spyrou@arm.com>   2020-06-25 13:21:00 +0000
commit     70d43a3671090d7ab104909a9433c88e02593038 (patch)
tree       44394282795be5b17bebb65f228303dbf7600bfe
parent     c41a6a611973cb245220641e06f8fa984b156954 (diff)
COMPMID-3538: Remove templates from NEGEMMLowpOffsetContributionOutputStageKernel
This change reduces the core's library size by 191Kb.

Change-Id: Ifb8eb0d7f8bc7713f2368803a62a4c9277cc5c87
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3439
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/NEON/NEAsymm.h                                                      |  91
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h        |  26
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp                  |   4
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp              | 285
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp   |  14
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp  |  10
6 files changed, 190 insertions(+), 240 deletions(-)
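The pattern applied throughout this patch is to replace compile-time bool template parameters (is_bounded_relu, has_bias, is_fixed_point, ...) with ordinary runtime arguments, so the compiler emits one function body instead of one per flag combination. The following is a minimal sketch of that trade-off, not code from the patch: the function name clamp_example and its body are hypothetical, and it builds only for NEON-capable targets because it includes <arm_neon.h>.

#include <arm_neon.h>

// Before (sketch): template <bool is_bounded_relu> forces the compiler to emit
// a separate copy of the whole function for true and for false.
// After (sketch): the flag becomes a plain argument, so a single copy is emitted
// and the choice costs one predictable branch per call instead of binary size.
inline uint8x16_t clamp_example(uint8x16_t out_u8, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
{
    if(is_bounded_relu)
    {
        out_u8 = vmaxq_u8(out_u8, min_u8); // clamp to the fused-ReLU lower bound
        out_u8 = vminq_u8(out_u8, max_u8); // clamp to the fused-ReLU upper bound
    }
    return out_u8;
}

Since the flag is uniform across a whole kernel invocation, the branch is well predicted and the runtime cost is negligible, while the dispatch tables of pre-instantiated templates (removed below in NEGEMMLowpOffsetContributionOutputStageKernel.cpp) disappear entirely.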
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index e4f4250d16..8558706c4d 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,24 +67,23 @@ int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4
/** Performs final quantization step on 16 elements
*
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32 Input to be quantized.
- * @param result_fixedpoint_multiplier Result multiplier parameter
- * @param result_shift Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_u8 Relu lower bound
- * @param max_u8 Relu upper bound
+ * @param[in] in_s32 Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_u8 Relu lower bound
+ * @param[in] max_u8 Relu upper bound
+ * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied
*
* @return Quantized values
*/
-template <bool is_bounded_relu>
-uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int32x4_t result_offset_after_shift_s32,
- uint8x16_t min_u8,
- uint8x16_t max_u8)
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32x4_t result_offset_after_shift_s32,
+ uint8x16_t min_u8,
+ uint8x16_t max_u8,
+ bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -150,24 +149,23 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
/** Performs final quantization step on 16 elements
*
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32 Input to be quantized.
- * @param result_fixedpoint_multiplier Result multiplier parameter
- * @param result_shift Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8 Relu lower bound
- * @param max_s8 Relu upper bound
+ * @param[in] in_s32 Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8 Relu lower bound
+ * @param[in] max_s8 Relu upper bound
+ * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied
*
* @return Quantized values
*/
-template <bool is_bounded_relu>
-int8x16_t finalize_quantization(int32x4x4_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int32x4_t result_offset_after_shift_s32,
- int8x16_t min_s8,
- int8x16_t max_s8)
+inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32x4_t result_offset_after_shift_s32,
+ int8x16_t min_s8,
+ int8x16_t max_s8,
+ bool is_bounded_relu)
{
if(result_shift < 0)
{
@@ -225,24 +223,23 @@ int8x16_t finalize_quantization(int32x4x4_t &in_s32,
/** Performs final quantization step on 16 elements for symmetric quantization
*
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32 Input to be quantized.
- * @param result_fixedpoint_multiplier Result multiplier parameter
- * @param result_shift Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8 Relu lower bound
- * @param max_s8 Relu upper bound
+ * @param[in] in_s32 Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8 Relu lower bound
+ * @param[in] max_s8 Relu upper bound
+ * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied
*
* @return Quantized values
*/
-template <bool is_bounded_relu>
inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
const int32x4x4_t &result_fixedpoint_multiplier,
const int32x4x4_t &result_shift,
const int32x4_t &result_offset_after_shift_s32,
const int8x16_t &min_s8,
- const int8x16_t &max_s8)
+ const int8x16_t &max_s8,
+ const bool is_bounded_relu)
{
const static int32x4_t one_s32 = vdupq_n_s32(1);
@@ -322,21 +319,19 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
/** Performs final quantization step on single element
*
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
* @param[in] in_value Input to be quantized.
* @param[in] result_fixedpoint_multiplier Result multiplier parameter
* @param[in] result_shift Result shift parameter
* @param[in] result_offset_after_shift_s32 Result offset parameter
* @param[in] min_u8 Relu lower bound
* @param[in] max_u8 Relu upper bound
+ * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied
*
* @return Quantized value
*/
-template <bool is_bounded_relu>
inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
int32_t result_shift, int32_t result_offset_after_shift_s32,
- uint8_t min_u8, uint8_t max_u8)
+ uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
@@ -367,21 +362,19 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
/** Performs final quantization step on single element
*
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
* @param[in] in_value Input to be quantized.
* @param[in] result_fixedpoint_multiplier Result multiplier parameter
* @param[in] result_shift Result shift parameter
* @param[in] result_offset_after_shift_s32 Result offset parameter
* @param[in] min_s8 Relu lower bound
* @param[in] max_s8 Relu upper bound
+ * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied
*
* @return Quantized value
*/
-template <bool is_bounded_relu>
inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
int32_t result_shift, int32_t result_offset_after_shift_s32,
- int8_t min_s8, int8_t max_s8)
+ int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
index 0dc64c9842..203b26e422 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -115,22 +115,18 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
- using NEGEMMLowpOffsetContributionOutputStageFunction = std::function<void(const Window, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
- ITensor *, int32_t, int32_t, int32_t, bool, GEMMLowpOutputStageInfo)>;
-
private:
/** Function to use for the particular tensors passed to configure() */
- NEGEMMLowpOffsetContributionOutputStageFunction _function;
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- const ITensor *_bias;
- const ITensor *_mm_result;
- ITensor *_output;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
- GEMMLowpOutputStageInfo _output_stage;
+ const ITensor *_vector_sum_col;
+ const ITensor *_vector_sum_row;
+ const ITensor *_bias;
+ const ITensor *_mm_result;
+ ITensor *_output;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ int32_t _k_offset;
+ bool _slide_vector_sum_col;
+ GEMMLowpOutputStageInfo _output_stage;
};
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 2f106a3f79..c016e2836c 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -275,7 +275,7 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window
}
const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
- wrapper::vstore(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
},
in, out);
}
@@ -326,7 +326,7 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
}
const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
- wrapper::vstore(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
},
in, bi, out);
}
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 31414e3f3f..019a204196 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -155,8 +155,7 @@ inline int32x4x4_t get_k_offset(int32_t k_offset)
};
}
-template <bool is_bounded_relu>
-inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -193,8 +192,7 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
return out_u8;
}
-template <bool is_bounded_relu>
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -231,8 +229,7 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
return out_s8;
}
-template <bool is_bounded_relu>
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -307,13 +304,13 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias)
return bias_it;
}
-template <typename VT, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+template <typename VT>
inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
typename VT::vtype min_vec, typename VT::vtype max_vec,
int32_t a_offset, int32_t b_offset, int32_t k_offset,
int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x)
+ int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
{
int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
if(!is_fixed_point)
@@ -355,12 +352,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
if(is_fixed_point)
{
wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec));
+ finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
}
else
{
wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_vec, max_vec));
+ finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
}
}
// Compute left-over elements
@@ -380,9 +377,9 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
if(is_fixed_point)
{
// Finalize and store the result
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset,
- static_cast<typename VT::stype>(min_bound),
- static_cast<typename VT::stype>(max_bound));
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset,
+ static_cast<typename VT::stype>(min_bound),
+ static_cast<typename VT::stype>(max_bound), is_bounded_relu);
}
else
{
@@ -400,12 +397,11 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
}
}
-template <bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
const int32_t *result_multipliers, const int32_t *result_shifts,
const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x)
+ int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
{
int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
if(!is_fixed_point)
@@ -435,11 +431,11 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
if(is_fixed_point)
{
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm<is_bounded_relu>(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8));
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu));
}
else
{
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point<is_bounded_relu>(in_s32, load(result_shifts, x), min_s8, max_s8));
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
}
}
// Compute left-over elements
@@ -459,7 +455,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
if(is_fixed_point)
{
// Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound));
+ *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
}
else
{
@@ -476,11 +472,11 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
}
}
-template <typename T, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+template <typename T>
void run_offset_contribution_output_stage(const Window &window,
const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
- GEMMLowpOutputStageInfo output_stage)
+ GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
{
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
using Typer = VectorTyper<T>;
@@ -533,13 +529,13 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer, true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
- mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+ mm_result_it,
+ out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
},
vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -551,11 +547,11 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer, true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point);
},
vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
}
@@ -576,12 +572,12 @@ void run_offset_contribution_output_stage(const Window &window,
const int batch_id = id.z() / depth_input;
const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer, false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point);
},
vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -592,11 +588,11 @@ void run_offset_contribution_output_stage(const Window &window,
const int batch_id = id.z() / depth_input;
const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer, false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point);
},
vector_sum_row_it, mm_result_it, out_it);
}
@@ -617,12 +613,12 @@ void run_offset_contribution_output_stage(const Window &window,
{
const int batch_id = id.z() / depth_input;
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window<Typer, true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point);
},
vector_sum_col_it, bias_it, mm_result_it, out_it);
}
@@ -632,11 +628,11 @@ void run_offset_contribution_output_stage(const Window &window,
{
const int batch_id = id.z() / depth_input;
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window<Typer, true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point);
},
vector_sum_col_it, mm_result_it, out_it);
}
@@ -648,11 +644,11 @@ void run_offset_contribution_output_stage(const Window &window,
Iterator bias_it = get_bias_it(collapsed_window, bias);
execute_window_loop(collapsed_window, [&](const Coordinates &)
{
- run_offset_contribution_output_stage_window<Typer, false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point);
},
bias_it, mm_result_it, out_it);
}
@@ -660,11 +656,11 @@ void run_offset_contribution_output_stage(const Window &window,
{
execute_window_loop(collapsed_window, [&](const Coordinates &)
{
- run_offset_contribution_output_stage_window<Typer, false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point);
},
mm_result_it, out_it);
}
@@ -672,11 +668,10 @@ void run_offset_contribution_output_stage(const Window &window,
}
}
-template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
void run_offset_contribution_output_stage_symm(const Window &window,
const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
- GEMMLowpOutputStageInfo output_stage)
+ GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
{
ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
@@ -720,11 +715,11 @@ void run_offset_contribution_output_stage_symm(const Window &window,
{
const int batch_id = id.z() / depth_input;
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window_symm<true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point);
},
vector_sum_col_it, bias_it, mm_result_it, out_it);
}
@@ -734,11 +729,11 @@ void run_offset_contribution_output_stage_symm(const Window &window,
{
const int batch_id = id.z() / depth_input;
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
- run_offset_contribution_output_stage_window_symm<true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
},
vector_sum_col_it, mm_result_it, out_it);
}
@@ -750,11 +745,11 @@ void run_offset_contribution_output_stage_symm(const Window &window,
Iterator bias_it = get_bias_it(collapsed_window, bias);
execute_window_loop(collapsed_window, [&](const Coordinates &)
{
- run_offset_contribution_output_stage_window_symm<false, true, is_bounded_relu, is_fixed_point>(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point);
},
bias_it, mm_result_it, out_it);
}
@@ -762,11 +757,11 @@ void run_offset_contribution_output_stage_symm(const Window &window,
{
execute_window_loop(collapsed_window, [&](const Coordinates &)
{
- run_offset_contribution_output_stage_window_symm<false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x);
+ run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point);
},
mm_result_it, out_it);
}
@@ -860,81 +855,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result,
return std::make_pair(Status{}, win);
}
-
-NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
-get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage)
-{
- static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qasymm =
- {
- { 0, &run_offset_contribution_output_stage<uint8_t, false, false, false> },
- { 1, &run_offset_contribution_output_stage<uint8_t, true, false, false> },
- { 2, &run_offset_contribution_output_stage<uint8_t, false, true, false> },
- { 3, &run_offset_contribution_output_stage<uint8_t, true, true, false> },
- { 4, &run_offset_contribution_output_stage<uint8_t, false, false, true> },
- { 5, &run_offset_contribution_output_stage<uint8_t, true, false, true> },
- { 6, &run_offset_contribution_output_stage<uint8_t, false, true, true> },
- { 7, &run_offset_contribution_output_stage<uint8_t, true, true, true> },
- { 8, &run_offset_contribution_output_stage<int8_t, false, false, false> },
- { 9, &run_offset_contribution_output_stage<int8_t, true, false, false> },
- { 10, &run_offset_contribution_output_stage<int8_t, false, true, false> },
- { 11, &run_offset_contribution_output_stage<int8_t, true, true, false> },
- { 12, &run_offset_contribution_output_stage<int8_t, false, false, true> },
- { 13, &run_offset_contribution_output_stage<int8_t, true, false, true> },
- { 14, &run_offset_contribution_output_stage<int8_t, false, true, true> },
- { 15, &run_offset_contribution_output_stage<int8_t, true, true, true> },
- };
-
- static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qsymm =
- {
- { 0, &run_offset_contribution_output_stage_symm<false, false, false> },
- { 1, &run_offset_contribution_output_stage_symm<true, false, false> },
- { 2, &run_offset_contribution_output_stage_symm<false, true, false> },
- { 3, &run_offset_contribution_output_stage_symm<true, true, false> },
- { 4, &run_offset_contribution_output_stage_symm<false, false, true> },
- { 5, &run_offset_contribution_output_stage_symm<true, false, true> },
- { 6, &run_offset_contribution_output_stage_symm<false, true, true> },
- { 7, &run_offset_contribution_output_stage_symm<true, true, true> }
- };
-
- // Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
- // Check if we need to clamp the result using min and max
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(output->info()->data_type());
- int32_t type_min_int = type_min.get<int32_t>();
- int32_t type_max_int = type_max.get<int32_t>();
- const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound <= type_min_int && output_stage.gemmlowp_max_bound >= type_max_int);
-
- // Check if we need to perform fixed point requantization
- const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
-
- // Check if symmetric per-channel execution
- const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED;
-
- // Check if symmetric per-channel execution
- const bool is_symm = output_stage.is_quantized_per_channel;
-
- // key acts as a bitset, setting the first bit on reinterpret_as_3d,
- // the second on is_bounded_relu, and the third on is_fixed_point.
- uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
- if(is_symm)
- {
- return map_function_qsymm.find(key)->second;
- }
- else
- {
- key |= ((is_signed ? 1UL : 0UL) << 3);
- return map_function_qasymm.find(key)->second;
- }
-}
} // namespace
NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel()
- : _function(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
+ : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
_output_stage(GEMMLowpOutputStageInfo())
{
@@ -977,8 +901,6 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_
auto win_config = validate_and_configure_window(mm_result->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
-
- _function = get_configured_function(mm_result, vector_sum_row, output, output_stage);
}
Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
@@ -996,7 +918,46 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, co
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- _function(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage);
+
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(_output->info()->data_type());
+ int32_t type_min_int = type_min.get<int32_t>();
+ int32_t type_max_int = type_max.get<int32_t>();
+
+ const bool reinterpret_as_3d = _vector_sum_row != nullptr
+ && _mm_result->info()->num_dimensions() > 1
+ && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
+
+ const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
+
+ // Check if we need to perform fixed point requantization
+ const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+
+ // Check if symmetric per-channel execution
+ const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED;
+
+ // Check if symmetric per-channel execution
+ const bool is_symm = _output_stage.is_quantized_per_channel;
+
+ if(is_symm)
+ {
+ run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+ reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ }
+ else
+ {
+ if(is_signed)
+ {
+ run_offset_contribution_output_stage<int8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+ reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ }
+ else
+ {
+ run_offset_contribution_output_stage<uint8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+ reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ }
+ }
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index b8ca17ec3d..9400c9704a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -140,7 +140,7 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window
in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8));
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
}
// Compute left-over elements
@@ -152,8 +152,8 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window
// Add bias
in_value += bias_value;
// Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max));
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
}
},
in, out, bias);
@@ -177,7 +177,7 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window
};
vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8));
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
}
// Compute left-over elements
@@ -186,8 +186,8 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window
const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
// Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max));
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
}
},
in, out);
@@ -242,4 +242,4 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window
(this->*_func)(window);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 4a9d2f7481..78610c95a7 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -145,7 +145,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
- vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+ vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
}
// Compute left-over elements
@@ -157,7 +157,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
// Add bias
in_value += bias_value;
// Finalize and store the result
- *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
}
},
in, out, bias);
@@ -180,7 +180,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
}
};
- vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+ vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
}
// Compute left-over elements
@@ -189,7 +189,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
// Finalize and store the result
- *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
}
},
in, out);
@@ -244,4 +244,4 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
(this->*_func)(window);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute