author    Sheri Zhang <sheri.zhang@arm.com>  2020-03-09 14:29:52 +0000
committer Sheri Zhang <sheri.zhang@arm.com>  2020-03-25 15:58:42 +0000
commit    1b14c75c0d591c4abe4d2d41b7e4e165fbf58382 (patch)
tree      41e671befde3f61247d0728d16907ff281d6294d
parent    2e5fd637205770ec5e11096e6e19b8efc67d544e (diff)
COMPMID-2968: Add support for QASYMM8_SIGNED in CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel
Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: I37e6e76dbd5546c0eaedfacd01ea905c37148e8a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2861
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  Android.bp                                                    2
-rw-r--r--  arm_compute/core/CL/CLKernels.h                               2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h (renamed from arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h)  54
-rw-r--r--  arm_compute/core/Types.h                                      1
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h     5
-rw-r--r--  src/core/CL/cl_kernels/gemmlowp.cl                           24
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp (renamed from src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp)  60
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp           28
-rw-r--r--  tests/validation/CL/GEMMLowp.cpp                             40
-rw-r--r--  tests/validation/fixtures/GEMMLowpFixture.h                 103
-rw-r--r--  tests/validation/reference/GEMMLowp.cpp                      63
-rw-r--r--  tests/validation/reference/GEMMLowp.h                         8
12 files changed, 315 insertions(+), 75 deletions(-)
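
Taken together, the change routes the existing float-based output stage through the generic GEMMLowpOutputStageInfo descriptor so it can emit either QASYMM8 or QASYMM8_SIGNED. A minimal host-side sketch of the new signed path (multiplier, offset and shape are illustrative; it mirrors the validation fixture added further down):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

void quantize_down_signed_sketch(const TensorShape &shape)
{
    CLScheduler::get().default_init(); // one-time OpenCL setup

    // S32 accumulators in, QASYMM8_SIGNED out
    CLTensor acc, dst;
    acc.allocator()->init(TensorInfo(shape, 1, DataType::S32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED));

    GEMMLowpOutputStageInfo info;
    info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT;
    info.gemmlowp_real_multiplier = 0.33f; // illustrative
    info.gemmlowp_offset          = 2;     // illustrative
    info.gemmlowp_min_bound       = -128;
    info.gemmlowp_max_bound       = 127;
    info.output_data_type         = DataType::QASYMM8_SIGNED;

    CLGEMMLowpOutputStage output_stage;
    output_stage.configure(&acc, nullptr, &dst, info); // bias omitted

    acc.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill acc with accumulator values, then:
    output_stage.run();
}
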
diff --git a/Android.bp b/Android.bp
index f9a41000dd..0d5c9e949d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -126,11 +126,11 @@ cc_library_static {
"src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp",
+ "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp",
- "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp",
"src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp",
"src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp",
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index f2e16ca139..b265aa2fe7 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -79,11 +79,11 @@
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
index 900a8c3b5d..439f569d07 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFLOATKERNEL_H
+#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
+#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
#include "arm_compute/core/CL/ICLKernel.h"
@@ -31,9 +31,9 @@ namespace arm_compute
// Forward declarations
class ICLTensor;
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
@@ -41,47 +41,43 @@ class ICLTensor;
* -# Requantize
* -# Add offset to each result
* -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel : public ICLKernel
+class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
{
public:
/** Constructor */
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel();
+ CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
/** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &) = delete;
+ CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &) = delete;
+ CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
/** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &&) = default;
+ CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
/** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &&) = default;
+ CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] multiplier Float multiplier to be multiplied to each element of the input matrix
- * @param[in] offset Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] info Output stage info. Used to pass the quantized output data type
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
*
* @param[in] input Input tensor. Data type supported: S32
* @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
* Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] info Output stage info. Used to pass the quantized output data type
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -92,4 +88,4 @@ private:
ICLTensor *_output;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFLOATKERNEL_H */
+#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
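
To make the doc comment above concrete, here is a scalar model of the per-element computation, following the reference implementation added in tests/validation/reference/GEMMLowp.cpp below (bias addition and the min == max "no clamp" special case omitted for brevity):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// T is uint8_t for QASYMM8 or int8_t for QASYMM8_SIGNED
template <typename T>
T quantize_down_scale_by_float(int32_t acc, float multiplier, int32_t offset,
                               int32_t min_bound, int32_t max_bound)
{
    // Scale by the real multiplier and add the quantization offset
    int32_t res = static_cast<int32_t>(std::round(static_cast<float>(acc) * multiplier + static_cast<float>(offset)));
    // Bounded ReLU between the requested min and max
    res = std::max(min_bound, std::min(max_bound, res));
    // Saturate to the output range: [0..255] or [-128..127]
    res = std::max<int32_t>(std::numeric_limits<T>::lowest(),
                            std::min<int32_t>(std::numeric_limits<T>::max(), res));
    return static_cast<T>(res);
}

// e.g. quantize_down_scale_by_float<int8_t>(1000, 0.33f, 2, -128, 127) == 127 (saturated)
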
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 711b68f236..37a9679a21 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1956,6 +1956,7 @@ struct GEMMLowpOutputStageInfo
int32_t gemmlowp_max_bound{ std::numeric_limits<int32_t>::max() }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
std::vector<int32_t> gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
std::vector<int32_t> gemmlowp_shifts{}; /**< GEMMLowp output stage shift used for quantizing to QASYMM8 */
+ float gemmlowp_real_multiplier{ 0 }; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */
bool is_quantized_per_channel{ false }; /**< GEMMLowp quantized per-channel flag */
DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */
};
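
The new gemmlowp_real_multiplier field carries the combined rescale factor directly as a float. For reference, the conventional way such a multiplier is derived from the operand quantization scales (standard gemmlowp practice, not part of this patch):

// real_multiplier folds the input and weights scales against the output scale
float compute_real_multiplier(float input_scale, float weights_scale, float output_scale)
{
    return (input_scale * weights_scale) / output_scale;
}
// e.g. info.gemmlowp_real_multiplier = compute_real_multiplier(0.5f, 0.25f, 0.38f);
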
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index 184d827d4b..05cffa6680 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -217,7 +217,7 @@ public:
*
* This function calls the following OpenCL kernels:
*
- * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel
+ * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
*
* @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
* after the result is shifted right by result_shift
@@ -237,6 +237,7 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
*/
+ ARM_COMPUTE_DEPRECATED_REL(20.05)
void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = std::numeric_limits<int32_t>::lowest(),
int max = std::numeric_limits<int32_t>::max());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
@@ -251,6 +252,7 @@ public:
*
* @return a status
*/
+ ARM_COMPUTE_DEPRECATED_REL(20.05)
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
};
/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
@@ -317,6 +319,7 @@ public:
* This function calls the following CL kernels:
*
* -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
+ * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
* -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
* -# @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
*/
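
Since the float/offset overloads are now tagged ARM_COMPUTE_DEPRECATED_REL(20.05), callers are expected to move to the info-based entry point. A sketch of the mapping, which matches the translation CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure now performs internally (see src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp below):

void configure_via_info(CLGEMMLowpOutputStage &stage, const ICLTensor *input, const ICLTensor *bias,
                        ICLTensor *output, float multiplier, int offset, int min, int max)
{
    // Deprecated form:
    //   CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat fn;
    //   fn.configure(input, bias, output, multiplier, offset, min, max);
    GEMMLowpOutputStageInfo info;
    info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT;
    info.gemmlowp_real_multiplier = multiplier;
    info.gemmlowp_offset          = offset;
    info.gemmlowp_min_bound       = min;
    info.gemmlowp_max_bound       = max;
    info.output_data_type         = DataType::QASYMM8;
    stage.configure(input, bias, output, info);
}
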
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 3fba781ede..7f2828689a 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -2317,9 +2317,9 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
-/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
@@ -2327,11 +2327,14 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
* -# Requantize
* -# Add offset to each result
* -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
*
* @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
*
* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
* These values can be used to implement "rectified linear unit" activation functions
*
@@ -2388,19 +2391,20 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
#endif // defined(ADD_BIAS)
// Convert to float
- float16 input_values_f = convert_float4(input_values);
- input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+ float4 input_values_f = convert_float4(input_values);
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
- uchar4 res = convert_uchar4_sat(input_values_f);
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
#if defined(MIN_BOUND)
- res = max(res, (uchar4)MIN_BOUND);
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res = min(res, (uchar4)MAX_BOUND);
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- vstore4(res, 0, dst_addr);
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
}
-#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
\ No newline at end of file
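
The kernel is now type-generic over OUTPUT_DATA_TYPE. For illustration, the build options the host-side configure() in the next file would assemble (option values are examples, and the uchar/char names assume the usual CL type mapping from get_cl_type_from_data_type()):

// QASYMM8:        VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) -> uchar4, CONVERT_SAT -> convert_uchar4_sat
const char *opts_qasymm8        = "-DREAL_MULTIPLIER=0.33 -DOUTPUT_OFFSET=2 -DOUTPUT_DATA_TYPE=uchar";
// QASYMM8_SIGNED: VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) -> char4,  CONVERT_SAT -> convert_char4_sat
const char *opts_qasymm8_signed = "-DREAL_MULTIPLIER=0.33 -DOUTPUT_OFFSET=2 -DOUTPUT_DATA_TYPE=char";
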
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index 7097dc9248..5a554f3111 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -21,9 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -32,7 +33,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -40,10 +41,13 @@ namespace arm_compute
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
- int min, int max)
+ const GEMMLowpOutputStageInfo *info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(min > max);
+ ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
+ || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
// Check biases if exist
if(bias != nullptr)
@@ -55,15 +59,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type)
{
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type));
+
constexpr unsigned int num_elems_processed_per_iteration = 4;
// Output auto initialization if not yet initialized
@@ -77,14 +84,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
bool window_changed = update_window_and_padding(win,
input_access);
- if(output->total_size() != 0)
- {
- Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
-
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
+ AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, output_result_access);
+ output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
if(bias != nullptr)
{
@@ -98,39 +100,39 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
class Coordinates;
-CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel()
+CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
: _input(nullptr), _bias(nullptr), _output(nullptr)
{
}
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo *info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (bias != nullptr) ? bias->clone().get() : nullptr,
- output->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
return Status{};
}
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
- float multiplier, int offset,
- int min, int max)
+void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ const GEMMLowpOutputStageInfo *info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
_input = input;
_bias = bias;
_output = output;
+ auto min = info->gemmlowp_min_bound;
+ auto max = info->gemmlowp_max_bound;
+
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
- build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(multiplier));
- build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(offset));
+ build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
+ build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
+ build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
@@ -139,12 +141,12 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::configure(const ICLTe
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_float", build_opts.options()));
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+ auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info->output_data_type);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
}
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
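
The reworked validate() also rejects bounds that fall outside the representable range of the requested output type. A sketch (shapes and bound values are illustrative):

#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

Status bounds_check_sketch()
{
    TensorInfo src(TensorShape(16U, 16U), 1, DataType::S32);
    TensorInfo dst(TensorShape(16U, 16U), 1, DataType::QASYMM8_SIGNED);

    GEMMLowpOutputStageInfo info;
    info.output_data_type   = DataType::QASYMM8_SIGNED;
    info.gemmlowp_min_bound = -128;
    info.gemmlowp_max_bound = 200; // > 127, outside QASYMM8_SIGNED: validate() returns an error

    return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(&src, nullptr, &dst, &info);
}
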
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index e86f303ff4..fbd1820098 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -24,11 +24,11 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
#include "support/MemorySupport.h"
namespace arm_compute
@@ -90,15 +90,24 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *
float multiplier, int offset,
int min, int max)
{
- auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel>();
- k->configure(input, bias, output, multiplier, offset, min, max);
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_offset = offset;
+ info.gemmlowp_real_multiplier = multiplier;
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(input, bias, output, &info);
_kernel = std::move(k);
}
Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
int min, int max)
{
- return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+ return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
}
void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
@@ -165,6 +174,13 @@ void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *b
}
break;
}
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(input, bias, output, &info);
+ _kernel = std::move(k);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
}
@@ -202,6 +218,10 @@ Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorIn
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
}
}
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ {
+ return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
+ }
default:
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
}
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 3d7c76aa2b..8aa81d0962 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -389,6 +389,46 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedP
TEST_SUITE_END() // MultGreater1
TEST_SUITE_END() // BoundedReLu
TEST_SUITE_END() // QuantizeDownInt32ToInt16ScaleByFixedPoint
+
+TEST_SUITE(QuantizeDownInt32ScaleByFloat)
+
+TEST_SUITE(QASYMM8)
+using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture =
+ GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage, uint8_t>;
+
+FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8),
+ datasets::TinyShapes()),
+ framework::dataset::make("result_real_multiplier", 0.33f)),
+ framework::dataset::make("result_offset", 2, 3)),
+ framework::dataset::make("min", 0)),
+ framework::dataset::make("max", 255)),
+ framework::dataset::make("addBias", { false, true })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed =
+ GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage, int8_t>;
+FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
+ datasets::TinyShapes()),
+ framework::dataset::make("result_real_multiplier", 0.33f)),
+ framework::dataset::make("result_offset", 2, 3)),
+ framework::dataset::make("min", -128)),
+ framework::dataset::make("max", 127)),
+ framework::dataset::make("addBias", { false, true })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // QuantizeDownInt32ScaleByFloat
+
TEST_SUITE_END() // OutputStage
TEST_SUITE_END() // GEMMLowp
TEST_SUITE_END() // CL
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 0207f4c5ae..be9ce96dcb 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -556,6 +556,109 @@ protected:
SimpleTensor<uint8_t> _reference{};
};
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture : public framework::Fixture
+{
+public:
+ template <typename...>
+ void setup(DataType data_type, TensorShape shape, float result_real_multiplier, int32_t result_offset, int32_t min, int32_t max, bool add_bias)
+ {
+ _target = compute_target(data_type, shape, result_real_multiplier, result_offset, min, max, add_bias);
+ _reference = compute_reference(shape, result_real_multiplier, result_offset, min, max, add_bias);
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i)
+ {
+ // To avoid all of the data being clamped
+ std::uniform_int_distribution<> distribution(-500, 500);
+ library->fill(tensor, distribution, i);
+ }
+
+ TensorType compute_target(DataType data_type, const TensorShape &shape, float result_multiplier, int32_t result_offset, int32_t min, int32_t max, bool add_bias)
+ {
+ TensorShape shape_bias(shape[0]);
+
+ // Create tensors
+ TensorType a = create_tensor<TensorType>(shape, DataType::S32, 1);
+ TensorType b = create_tensor<TensorType>(shape_bias, DataType::S32, 1);
+ TensorType c = create_tensor<TensorType>(shape, data_type, 1);
+
+ // create output stage info
+ GEMMLowpOutputStageInfo info;
+ info.gemmlowp_max_bound = max;
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_real_multiplier = result_multiplier;
+ info.gemmlowp_offset = result_offset;
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT;
+ info.output_data_type = data_type;
+
+ // Create and configure function
+ FunctionType output_stage;
+ output_stage.configure(&a, add_bias ? &b : nullptr, &c, info);
+
+ ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Allocate tensors
+ a.allocator()->allocate();
+ c.allocator()->allocate();
+
+ ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Fill tensor
+ fill(AccessorType(a), 0);
+
+ if(add_bias)
+ {
+ ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Allocate bias tensor
+ b.allocator()->allocate();
+
+ ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Fill tensor
+ fill(AccessorType(b), 1);
+ }
+
+ // Compute GEMM function
+ output_stage.run();
+ return c;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &shape, float_t result_real_multiplier, int32_t result_offset, int32_t min, int32_t max, bool add_bias)
+ {
+ // Create reference
+ TensorShape shape_bias(shape[0]);
+
+ SimpleTensor<int32_t> a{ shape, DataType::S32, 1 };
+ SimpleTensor<int32_t> b{ shape_bias, DataType::S32, 1 };
+
+ // Fill reference
+ fill(a, 0);
+
+ const std::vector<float_t> result_float_multiplier_vec = { result_real_multiplier };
+
+ if(add_bias)
+ {
+ // Fill bias
+ fill(b, 1);
+
+ return reference::gemmlowp_quantize_down_scale_by_float<int32_t, T>(a, b, result_float_multiplier_vec, result_offset, min, max);
+ }
+ else
+ {
+ return reference::gemmlowp_quantize_down_scale_by_float<int32_t, T>(a, result_float_multiplier_vec, result_offset, min, max);
+ }
+ }
+
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+};
+
template <typename TensorType, typename AccessorType, typename FunctionType>
class GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture : public framework::Fixture
{
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 99d08e34f1..61617c8aae 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -131,6 +131,39 @@ void quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> *in, const Simple
std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
}
}
+
+template <typename TIn, typename TOut>
+void quantize_down_scale_by_float(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<float_t> result_real_multiplier,
+ int32_t result_offset, int32_t min, int32_t max)
+{
+ const int cols_in = in->shape().x();
+ const bool is_per_channel = result_real_multiplier.size() > 1;
+
+ for(int i = 0; i < in->num_elements(); ++i)
+ {
+ TIn result = (*in)[i];
+
+ if(bias != nullptr)
+ {
+ result += (*bias)[i % cols_in];
+ }
+
+ // Float multiplication
+ const float_t multiplier = (is_per_channel) ? result_real_multiplier[i % cols_in] : result_real_multiplier[0];
+
+ float_t result_f = static_cast<float_t>(result) * multiplier + static_cast<float_t>(result_offset);
+ result = static_cast<TIn>(std::round(result_f));
+
+ // Bounded ReLu
+ if(min != max)
+ {
+ result = std::max(min, std::min(max, result));
+ }
+
+ (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
+ std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
+ }
+}
} // namespace
template <typename T_out, typename T_in, typename T_in_1>
@@ -237,6 +270,36 @@ SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor
return dst;
}
+template <typename TIn, typename TOut>
+SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
+{
+ SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
+
+ quantize_down_scale_by_float<TIn, TOut>(&in, &bias, &dst, result_real_multiplier, result_offset, min, max);
+
+ return dst;
+}
+
+template <typename TIn, typename TOut>
+SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
+{
+ SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
+
+ quantize_down_scale_by_float<TIn, TOut>(&in, nullptr, &dst, result_real_multiplier, result_offset, min, max);
+
+ return dst;
+}
+
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
+template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
+template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index 7d711263e8..5de48dab52 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -59,6 +59,14 @@ SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor
template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0);
+
+template <typename TIn, typename TOut>
+SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min = 0, int32_t max = 0);
+
+template <typename TIn, typename TOut>
+SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in,
+ std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min = 0, int32_t max = 0);
} // namespace reference
} // namespace validation
} // namespace test