From 70d43a3671090d7ab104909a9433c88e02593038 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Mon, 22 Jun 2020 17:05:43 +0100 Subject: COMPMID-3538: Remove templates from NEGEMMLowpOffsetContributionOutputStageKernel This change reduces the core's library size by 191Kb. Change-Id: Ifb8eb0d7f8bc7713f2368803a62a4c9277cc5c87 Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3439 Reviewed-by: Michele Di Giorgio Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/NEON/NEAsymm.h | 91 +++---- ...NEGEMMLowpOffsetContributionOutputStageKernel.h | 26 +- .../NEDirectConvolutionLayerOutputStageKernel.cpp | 4 +- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 285 +++++++++------------ ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 14 +- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 10 +- 6 files changed, 190 insertions(+), 240 deletions(-) diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h index e4f4250d16..8558706c4d 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/arm_compute/core/NEON/NEAsymm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -67,24 +67,23 @@ int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4 /** Performs final quantization step on 16 elements * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied - * - * @param in_s32 Input to be quantized. - * @param result_fixedpoint_multiplier Result multiplier parameter - * @param result_shift Result shift parameter - * @param result_offset_after_shift_s32 Result offset parameter - * @param min_u8 Relu lower bound - * @param max_u8 Relu upper bound + * @param[in] in_s32 Input to be quantized. + * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_u8 Relu lower bound + * @param[in] max_u8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized values */ -template -uint8x16_t finalize_quantization(int32x4x4_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int32x4_t result_offset_after_shift_s32, - uint8x16_t min_u8, - uint8x16_t max_u8) +inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + uint8x16_t min_u8, + uint8x16_t max_u8, + bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -150,24 +149,23 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32, /** Performs final quantization step on 16 elements * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied - * - * @param in_s32 Input to be quantized. - * @param result_fixedpoint_multiplier Result multiplier parameter - * @param result_shift Result shift parameter - * @param result_offset_after_shift_s32 Result offset parameter - * @param min_s8 Relu lower bound - * @param max_s8 Relu upper bound + * @param[in] in_s32 Input to be quantized. 
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized values */ -template -int8x16_t finalize_quantization(int32x4x4_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int32x4_t result_offset_after_shift_s32, - int8x16_t min_s8, - int8x16_t max_s8) +inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32x4_t result_offset_after_shift_s32, + int8x16_t min_s8, + int8x16_t max_s8, + bool is_bounded_relu) { if(result_shift < 0) { @@ -225,24 +223,23 @@ int8x16_t finalize_quantization(int32x4x4_t &in_s32, /** Performs final quantization step on 16 elements for symmetric quantization * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied - * - * @param in_s32 Input to be quantized. - * @param result_fixedpoint_multiplier Result multiplier parameter - * @param result_shift Result shift parameter - * @param result_offset_after_shift_s32 Result offset parameter - * @param min_s8 Relu lower bound - * @param max_s8 Relu upper bound + * @param[in] in_s32 Input to be quantized. + * @param[in] result_fixedpoint_multiplier Result multiplier parameter + * @param[in] result_shift Result shift parameter + * @param[in] result_offset_after_shift_s32 Result offset parameter + * @param[in] min_s8 Relu lower bound + * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized values */ -template inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, const int32x4x4_t &result_fixedpoint_multiplier, const int32x4x4_t &result_shift, const int32x4_t &result_offset_after_shift_s32, const int8x16_t &min_s8, - const int8x16_t &max_s8) + const int8x16_t &max_s8, + const bool is_bounded_relu) { const static int32x4_t one_s32 = vdupq_n_s32(1); @@ -321,8 +318,6 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, } /** Performs final quantization step on single element - * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied * * @param[in] in_value Input to be quantized. * @param[in] result_fixedpoint_multiplier Result multiplier parameter @@ -330,13 +325,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, * @param[in] result_offset_after_shift_s32 Result offset parameter * @param[in] min_u8 Relu lower bound * @param[in] max_u8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized value */ -template inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift_s32, - uint8_t min_u8, uint8_t max_u8) + uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); @@ -366,8 +361,6 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul } /** Performs final quantization step on single element - * - * @tparam is_bounded_relu Specified if a fused bounded relu should be applied * * @param[in] in_value Input to be quantized. 
* @param[in] result_fixedpoint_multiplier Result multiplier parameter @@ -375,13 +368,13 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul * @param[in] result_offset_after_shift_s32 Result offset parameter * @param[in] min_s8 Relu lower bound * @param[in] max_s8 Relu upper bound + * @param[in] is_bounded_relu Specified if a fused bounded relu should be applied * * @return Quantized value */ -template inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int32_t result_offset_after_shift_s32, - int8_t min_s8, int8_t max_s8) + int8_t min_s8, int8_t max_s8, bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h index 0dc64c9842..203b26e422 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -115,22 +115,18 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - using NEGEMMLowpOffsetContributionOutputStageFunction = std::function; - private: /** Function to use for the particular tensors passed to configure() */ - NEGEMMLowpOffsetContributionOutputStageFunction _function; - const ITensor *_vector_sum_col; - const ITensor *_vector_sum_row; - const ITensor *_bias; - const ITensor *_mm_result; - ITensor *_output; - int32_t _a_offset; - int32_t _b_offset; - int32_t _k_offset; - bool _slide_vector_sum_col; - GEMMLowpOutputStageInfo _output_stage; + const ITensor *_vector_sum_col; + const ITensor *_vector_sum_row; + const ITensor *_bias; + const ITensor *_mm_result; + ITensor *_output; + int32_t _a_offset; + int32_t _b_offset; + int32_t _k_offset; + bool _slide_vector_sum_col; + GEMMLowpOutputStageInfo _output_stage; }; } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 2f106a3f79..c016e2836c 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -275,7 +275,7 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window } const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max)); + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); }, in, out); } @@ -326,7 +326,7 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window } const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max)); + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); }, in, bi, out); } diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp 
b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index 31414e3f3f..019a204196 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -155,8 +155,7 @@ inline int32x4x4_t get_k_offset(int32_t k_offset) }; } -template -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8) +inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -193,8 +192,7 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } -template -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -231,8 +229,7 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } -template -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8) +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -307,13 +304,13 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -template +template inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, typename VT::vtype min_vec, typename VT::vtype max_vec, int32_t a_offset, int32_t b_offset, int32_t k_offset, int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x) + int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -355,12 +352,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec)); + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); } else { wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec)); + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); } } // Compute left-over elements @@ -380,9 +377,9 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast(min_bound), - static_cast(max_bound)); + *reinterpret_cast(out_it.ptr() + x) = 
finalize_quantization(in_value, multiplier, shift, offset, + static_cast(min_bound), + static_cast(max_bound), is_bounded_relu); } else { @@ -400,12 +397,11 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su } } -template inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, const int32_t *result_multipliers, const int32_t *result_shifts, const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x) + int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) { int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; if(!is_fixed_point) @@ -435,11 +431,11 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect if(is_fixed_point) { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8)); + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); } else { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8)); + vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); } } // Compute left-over elements @@ -459,7 +455,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound)); + *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound), is_bounded_relu); } else { @@ -476,11 +472,11 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } } -template +template void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage) + GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; using Typer = VectorTyper; @@ -533,13 +529,13 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + 
run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), + mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -551,11 +547,11 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -576,12 +572,12 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -592,11 +588,11 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); }, vector_sum_row_it, mm_result_it, out_it); } @@ -617,12 
+613,12 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -632,11 +628,11 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, mm_result_it, out_it); } @@ -648,11 +644,11 @@ void run_offset_contribution_output_stage(const Window &window, Iterator bias_it = get_bias_it(collapsed_window, bias); execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); }, bias_it, mm_result_it, out_it); } @@ -660,11 +656,11 @@ void run_offset_contribution_output_stage(const Window &window, { execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, 
false, false, is_bounded_relu, is_fixed_point); }, mm_result_it, out_it); } @@ -672,11 +668,10 @@ void run_offset_contribution_output_stage(const Window &window, } } -template void run_offset_contribution_output_stage_symm(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage) + GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) { ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); @@ -720,11 +715,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -734,11 +729,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); }, vector_sum_col_it, mm_result_it, out_it); } @@ -750,11 +745,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, Iterator bias_it = get_bias_it(collapsed_window, bias); execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); }, bias_it, mm_result_it, out_it); } @@ -762,11 +757,11 @@ void run_offset_contribution_output_stage_symm(const Window &window, { execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, 
out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); }, mm_result_it, out_it); } @@ -860,81 +855,10 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, return std::make_pair(Status{}, win); } - -NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction -get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) -{ - static std::map map_function_qasymm = - { - { 0, &run_offset_contribution_output_stage }, - { 1, &run_offset_contribution_output_stage }, - { 2, &run_offset_contribution_output_stage }, - { 3, &run_offset_contribution_output_stage }, - { 4, &run_offset_contribution_output_stage }, - { 5, &run_offset_contribution_output_stage }, - { 6, &run_offset_contribution_output_stage }, - { 7, &run_offset_contribution_output_stage }, - { 8, &run_offset_contribution_output_stage }, - { 9, &run_offset_contribution_output_stage }, - { 10, &run_offset_contribution_output_stage }, - { 11, &run_offset_contribution_output_stage }, - { 12, &run_offset_contribution_output_stage }, - { 13, &run_offset_contribution_output_stage }, - { 14, &run_offset_contribution_output_stage }, - { 15, &run_offset_contribution_output_stage }, - }; - - static std::map map_function_qsymm = - { - { 0, &run_offset_contribution_output_stage_symm }, - { 1, &run_offset_contribution_output_stage_symm }, - { 2, &run_offset_contribution_output_stage_symm }, - { 3, &run_offset_contribution_output_stage_symm }, - { 4, &run_offset_contribution_output_stage_symm }, - { 5, &run_offset_contribution_output_stage_symm }, - { 6, &run_offset_contribution_output_stage_symm }, - { 7, &run_offset_contribution_output_stage_symm } - }; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - // Check if we need to clamp the result using min and max - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(output->info()->data_type()); - int32_t type_min_int = type_min.get(); - int32_t type_max_int = type_max.get(); - const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound <= type_min_int && output_stage.gemmlowp_max_bound >= type_max_int); - - // Check if we need to perform fixed point requantization - const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; - - // Check if symmetric per-channel execution - const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED; - - // Check if symmetric per-channel execution - const bool is_symm = output_stage.is_quantized_per_channel; - - // key acts as a bitset, setting the first bit on reinterpret_as_3d, - // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 
1UL : 0UL) << 2); - if(is_symm) - { - return map_function_qsymm.find(key)->second; - } - else - { - key |= ((is_signed ? 1UL : 0UL) << 3); - return map_function_qasymm.find(key)->second; - } -} } // namespace NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel() - : _function(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true), + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true), _output_stage(GEMMLowpOutputStageInfo()) { @@ -977,8 +901,6 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_ auto win_config = validate_and_configure_window(mm_result->info(), output->info()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - - _function = get_configured_function(mm_result, vector_sum_row, output, output_stage); } Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, @@ -996,7 +918,46 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, co ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - _function(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(_output->info()->data_type()); + int32_t type_min_int = type_min.get(); + int32_t type_max_int = type_max.get(); + + const bool reinterpret_as_3d = _vector_sum_row != nullptr + && _mm_result->info()->num_dimensions() > 1 + && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x(); + + const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + + // Check if we need to perform fixed point requantization + const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + + // Check if symmetric per-channel execution + const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED; + + // Check if symmetric per-channel execution + const bool is_symm = _output_stage.is_quantized_per_channel; + + if(is_symm) + { + run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + if(is_signed) + { + run_offset_contribution_output_stage(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + run_offset_contribution_output_stage(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp 
b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index b8ca17ec3d..9400c9704a 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -140,7 +140,7 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8)); + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); } // Compute left-over elements @@ -152,8 +152,8 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window // Add bias in_value += bias_value; // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max)); + *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out, bias); @@ -177,7 +177,7 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window }; vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8)); + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); } // Compute left-over elements @@ -186,8 +186,8 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max)); + *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out); @@ -242,4 +242,4 @@ void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window (this->*_func)(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 4a9d2f7481..78610c95a7 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -145,7 +145,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, 
is_bounded_relu)); } // Compute left-over elements @@ -157,7 +157,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window // Add bias in_value += bias_value; // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max)); + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out, bias); @@ -180,7 +180,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window } }; - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); } // Compute left-over elements @@ -189,7 +189,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max)); + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); } }, in, out); @@ -244,4 +244,4 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window (this->*_func)(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute -- cgit v1.2.1
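
For context, a minimal sketch of the refactoring pattern this patch applies follows. It is not part of the patch or of the library: the function names below are hypothetical, and the sketch only illustrates why replacing non-type bool template parameters with runtime arguments shrinks the binary. In the kernel itself, the old dispatch table enumerated 16 QASYMM and 8 QSYMM specialisations of run_offset_contribution_output_stage*, each emitted as separate code; the new version keeps a single copy and passes is_bounded_relu / is_fixed_point / is_gemm3d as ordinary arguments.

// Illustrative sketch only (hypothetical names, not the library's API).
#include <algorithm>
#include <cstdint>

// Before: one instantiation per template argument value ends up in the binary.
template <bool is_bounded_relu>
int32_t clamp_example_templated(int32_t v, int32_t lo, int32_t hi)
{
    return is_bounded_relu ? std::min(std::max(v, lo), hi) : v;
}

// After: a single function; the flag becomes a runtime argument. The extra
// branch is negligible next to the per-element vector arithmetic these
// kernels perform, while only one copy of the code is emitted.
inline int32_t clamp_example(int32_t v, int32_t lo, int32_t hi, bool is_bounded_relu)
{
    return is_bounded_relu ? std::min(std::max(v, lo), hi) : v;
}

int main()
{
    // Both calls produce the same result; only the second avoids an extra
    // template instantiation.
    const int32_t a = clamp_example_templated<true>(300, 0, 255);
    const int32_t b = clamp_example(300, 0, 255, true);
    return (a == b) ? 0 : 1;
}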