From e03802edd37229a1868bacedd7571cc443810caf Mon Sep 17 00:00:00 2001
From: Usama Arif <usama.arif@arm.com>
Date: Mon, 11 Mar 2019 12:20:20 +0000
Subject: COMPMID-1936: Add support for QASYMM8 in CLQuantizeLayer.

Change-Id: I9aa1f1f1753bcdee6a74ec15b4fb366f823788b4
Signed-off-by: Usama Arif <usama.arif@arm.com>
Reviewed-on: https://review.mlplatform.org/c/850
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 .../core/CL/kernels/CLQuantizationLayerKernel.h    | 21 ++---
 .../runtime/CL/functions/CLQuantizationLayer.h     | 31 ++------
 src/core/CL/cl_kernels/quantization_layer.cl       | 80 ++++++++++---------
 src/core/CL/kernels/CLQuantizationLayerKernel.cpp  | 90 +++++++++++-----------
 src/runtime/CL/functions/CLQuantizationLayer.cpp   | 52 +++----------
 tests/benchmark/CL/QuantizationLayer.cpp           |  4 +-
 .../benchmark/fixtures/QuantizationLayerFixture.h  |  6 +-
 tests/validation/CL/QuantizationLayer.cpp          | 48 ++++++++----
 tests/validation/NEON/QuantizationLayer.cpp        |  2 +-
 .../validation/fixtures/QuantizationLayerFixture.h | 62 ---------------
 tests/validation/reference/QuantizationLayer.cpp   | 50 +-----------
 tests/validation/reference/QuantizationLayer.h     |  3 -
 12 files changed, 155 insertions(+), 294 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
index 5d78dce1c2..d16ae546ff 100644
--- a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,24 +49,20 @@ public:
     CLQuantizationLayerKernel &operator=(CLQuantizationLayerKernel &&) = default;
     /** Default destructor */
     ~CLQuantizationLayerKernel() = default;
-    /** Set the input, output, min and max.
+    /** Set the input, output.
      *
-     * @param[in]  input   Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
-     * @param[out] output  Destination tensor with the same dimensions of input. Output data type must be U8.
-     * @param[in]  min_max Pointer to the tensor with shape [2, batches] which stores the minimum and maximum value for each 3D input tensor.
-     *                     The dimensions over the second must match the batched dimensions of the input tensor. Data type supported: F32.
+     * @param[in]  input  Source tensor. Data types supported: F32/F16.
+     * @param[out] output Destination tensor with the same dimensions of input. Output data type must be QASYMM8.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max);
+    void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayerKernel
      *
-     * @param[in] input   Input tensor info. Data types supported: F32.
-     * @param[in] output  Output tensor info. Output data type must be U8.
-     * @param[in] min_max Info for the tensor with shape [2, batches] which stores the minimum and maximum value for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data type supported: F32.
+     * @param[in] input  Input tensor info. Data types supported: F32/F16.
+     * @param[in] output Output tensor info. Output data type must be QASYMM8.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -74,7 +70,6 @@ public:
 private:
     const ICLTensor *_input;
     ICLTensor       *_output;
-    const ICLTensor *_min_max;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
index 738187dfe7..81dcfad515 100644
--- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,7 @@
 #ifndef __ARM_COMPUTE_CLQUANTIZATIONLAYER_H__
 #define __ARM_COMPUTE_CLQUANTIZATIONLAYER_H__
 
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
 {
@@ -38,37 +34,26 @@ class ICLTensor;
  *
  * @note The implementation supports only 3D input tensors.
  *
- * -# @ref CLMinMaxLayerKernel
  * -# @ref CLQuantizationLayerKernel
  *
  */
-class CLQuantizationLayer : public IFunction
+class CLQuantizationLayer : public ICLSimpleFunction
 {
 public:
-    /** Default constructor */
-    CLQuantizationLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
-     * @param[out] output Destination tensor with the same dimensions of input. Output data type must be U8.
+     * @param[in]  input  Source tensor. Data types supported: F16/F32.
+     * @param[out] output Destination tensor with the same dimensions of input. Output data type must be QASYMM8.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayer
      *
-     * @param[in] input  Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: F32.
-     * @param[in] output Output tensor info. Output data type must be U8.
+     * @param[in] input  Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Output data type must be QASYMM8.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    CLQuantizationLayerKernel _quantize_kernel;
-    CLMinMaxLayerKernel       _min_max_kernel;
-    CLTensor                  _min_max;
 };
-}
+} //namespace arm_compute
 #endif /* __ARM_COMPUTE_CLQUANTIZATIONLAYER_H__ */
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
index 80ea54012f..7ae34ef71a 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/quantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,53 +23,63 @@
  */
 #include "helpers.h"
 
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
+
 /** This performs the quantization of floating point inputs to 8-bit unsigned integers.
  *
- * @param[in]  input_ptr                             Pointer to the source image. Supported data types: F32
- * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: U8
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  min_max_ptr                           Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
- * @param[in]  min_max_stride_x                      Stride of the min/max vector in X dimension (in bytes)
- * @param[in]  min_max_step_x                        min_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void quantization_layer(
     TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    VECTOR_DECLARATION(min_max))
+    TENSOR3D_DECLARATION(output))
 {
     // Get pixels pointer
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
-    // min_max_value.s0 = min, min_max_value.s1 = max
-    const float2 min_max_value = vload2(0, (__global float *)(min_max_ptr + min_max_offset_first_element_in_bytes));
-
-    const float4 vmin   = (float4)min_max_value.s0;
-    const float4 vrange = (float4)(min_max_value.s1 - min_max_value.s0);
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
 
     // Load data
-    float4 data = vload4(0, (__global float *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
 
-    // Map float values to range [0.0, 1.0]
-    data = (data - vmin) / vrange;
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset      = OFFSET;
 
-    // Quantize and saturate
-    uchar4 res = convert_uchar4_sat(data * 256.0f);
+    // Quantize
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res = CLAMP(CONVERT_RTE_VEC(val / vscale, int, VEC_SIZE) + voffset, 0, 255);
 
-    // Store result
-    vstore4(res, 0, (__global uchar *)output.ptr);
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)output.ptr);
+#else  //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global uchar *)(output.ptr)) = (uchar)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, 0, 255);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
+#endif //defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 9028b0f604..374b22eab1 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -36,73 +37,76 @@ using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
-    if(output->tensor_shape().total_size() > 0)
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    if((output != nullptr) && (output->total_size() != 0))
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
-    // Configure window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
 
-    // Update window and padding
-    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::QASYMM8);
 
-    output_access.set_valid_region(win, input->valid_region());
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
+    return std::make_tuple(Status{}, win);
 }
 } // namespace
 
 CLQuantizationLayerKernel::CLQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _min_max(nullptr)
+    : _input(nullptr), _output(nullptr)
 {
 }
 
-void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
+void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
-    _input   = input;
-    _output  = output;
-    _min_max = min_max;
+    _input  = input;
+    _output = output;
 
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
+    const int  vec_size_x     = 16 / input->info()->element_size();
+    const int  input_width_x  = input->info()->tensor_shape().x();
+    const bool multi_access_x = (input_width_x / vec_size_x > 0);
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    // Create and update the window (if needed)
+    Window win = calculate_max_window(*input->info());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
 
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer", build_opts.options()));
 }
 
-Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
 
     return Status{};
 }
@@ -117,13 +121,9 @@ void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queu
 
     do
     {
-        Window slice_min_max = slice.shift_dimensions(2);
-        slice_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        add_1D_tensor_argument(idx, _min_max, slice_min_max);
         enqueue(queue, *this, slice);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index a13859cda3..df10e1e748 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,54 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-CLQuantizationLayer::CLQuantizationLayer()
-    : _quantize_kernel(), _min_max_kernel(), _min_max()
+namespace arm_compute
 {
-}
-
-Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-    TensorInfo min_max{ input->num_channels(), input->data_type() };
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
-
-    return Status{};
-}
-
 void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
-    _min_max_kernel.configure(input, &_min_max);
-
-    // Configure quantize kernel
-    _quantize_kernel.configure(input, output, &_min_max);
-
-    // Allocate min_max tensor
-    _min_max.allocator()->allocate();
+    auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
 
-void CLQuantizationLayer::run()
+Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    // Reset min and max
-    _min_max_kernel.reset(q);
-
-    // Run min-max kernel
-    CLScheduler::get().enqueue(_min_max_kernel, false);
-
-    // Run quantize kernel
-    CLScheduler::get().enqueue(_quantize_kernel, false);
+    return CLQuantizationLayerKernel::validate(input, output);
 }
+} // namespace arm_compute
diff --git a/tests/benchmark/CL/QuantizationLayer.cpp b/tests/benchmark/CL/QuantizationLayer.cpp
index 2dc775af0a..f52e6f078d 100644
--- a/tests/benchmark/CL/QuantizationLayer.cpp
+++ b/tests/benchmark/CL/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,7 @@ namespace benchmark
 {
 namespace
 {
-const auto data_types = framework::dataset::make("DataType", { DataType::F32 });
+const auto data_types = framework::dataset::make("DataType", { DataType::F32, DataType::F16 });
 } // namespace
 
 using CLQuantizationLayerFixture = QuantizationLayerFixture<CLTensor, CLQuantizationLayer, CLAccessor>;
diff --git a/tests/benchmark/fixtures/QuantizationLayerFixture.h b/tests/benchmark/fixtures/QuantizationLayerFixture.h
index 4b2fc88602..f2e8889423 100644
--- a/tests/benchmark/fixtures/QuantizationLayerFixture.h
+++ b/tests/benchmark/fixtures/QuantizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,9 +43,11 @@ public:
     template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
+        const QuantizationInfo q_info(0.5f, -10);
+
         // Create tensors
         src = create_tensor<TensorType>(shape, data_type);
-        dst = create_tensor<TensorType>(shape, DataType::U8);
+        dst = create_tensor<TensorType>(shape, DataType::QASYMM8, 1, q_info);
 
         // Create and configure function
         quantization_func.configure(&src, &dst);
diff --git a/tests/validation/CL/QuantizationLayer.cpp b/tests/validation/CL/QuantizationLayer.cpp
index f0cc4ccafa..26e030489c 100644
--- a/tests/validation/CL/QuantizationLayer.cpp
+++ b/tests/validation/CL/QuantizationLayer.cpp
@@ -53,21 +53,17 @@ TEST_SUITE(QuantizationLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U8),  // Wrong input data type
-                                                       TensorInfo(TensorShape(16U, 5U, 16U), 1, DataType::U8),       // Invalid shape
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),  // Wrong input data type
                                                        TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), // Wrong output data type
-                                                       TensorInfo(TensorShape(16U, 16U, 2U, 5U), 1, DataType::U8),   // Mismatching shapes
-                                                       TensorInfo(TensorShape(17U, 16U, 16U, 5U), 1, DataType::U8),  // Shrink window
+                                                       TensorInfo(TensorShape(16U, 16U, 2U, 5U), 1, DataType::F32),   // Mismatching shapes
                                                        TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), // Valid
                                                      }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 5U, 16U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U16),
-                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(17U, 16U, 16U, 5U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U8),
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),
                                                      })),
-               framework::dataset::make("Expected", { false, false, false, false, false, true})),
+               framework::dataset::make("Expected", { false, false, false, true})),
                input_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLQuantizationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -79,7 +75,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationS
 {
     // Create tensors
     CLTensor src = create_tensor<CLTensor>(shape, data_type);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::U8);
+    CLTensor dst = create_tensor<CLTensor>(shape, DataType::QASYMM8);
 
     ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -94,9 +90,8 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationS
     validate(dst.info()->valid_region(), valid_region);
 
     // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 4).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
+    validate(src.info()->padding(), PaddingSize());
+    validate(dst.info()->padding(), PaddingSize());
 }
 
 template <typename T>
@@ -104,19 +99,38 @@ using CLQuantizationLayerFixture = QuantizationValidationFixture<CLTensor, CLAcc
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(concat(datasets::Small3DShapes(), datasets::Small4DShapes()),
-                                                                                                               framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Small4DShapes()),
+                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
+                                                                                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(concat(datasets::Large3DShapes(), datasets::Large4DShapes()),
-                                                                                                             framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::Large3DShapes(), datasets::Large4DShapes()),
+                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
+                                                                                                             framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(concat(datasets::Small3DShapes(), datasets::Small4DShapes()),
+                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
+                                                                                                              framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::Large3DShapes(), datasets::Large4DShapes()),
+                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
+                                                                                                            framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // QuantizationLayer
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index 487eb70120..0b503c09b3 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -97,7 +97,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationS
 }
 
 template <typename T>
-using NEQuantizationLayerFixture = QAsymm8QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T>;
+using NEQuantizationLayerFixture = QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T>; // fixture whose setup() takes a QuantizationInfo
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
diff --git a/tests/validation/fixtures/QuantizationLayerFixture.h b/tests/validation/fixtures/QuantizationLayerFixture.h
index 65de405788..84d4d7a7b3 100644
--- a/tests/validation/fixtures/QuantizationLayerFixture.h
+++ b/tests/validation/fixtures/QuantizationLayerFixture.h
@@ -45,68 +45,6 @@ namespace validation
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class QuantizationValidationFixture : public framework::Fixture
 {
-public:
-    template <typename...>
-    void setup(TensorShape shape, DataType data_type)
-    {
-        _target    = compute_target(shape, data_type);
-        _reference = compute_reference(shape, data_type);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor)
-    {
-        library->fill_tensor_uniform(tensor, 0);
-    }
-
-    TensorType compute_target(const TensorShape &shape, DataType data_type)
-    {
-        // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type);
-        TensorType dst = create_tensor<TensorType>(shape, DataType::U8);
-
-        // Create and configure function
-        FunctionType quantization_layer;
-        quantization_layer.configure(&src, &dst);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        src.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(src));
-
-        // Compute function
-        quantization_layer.run();
-
-        return dst;
-    }
-
-    SimpleTensor<uint8_t> compute_reference(const TensorShape &shape, DataType data_type)
-    {
-        // Create reference
-        SimpleTensor<T> src{ shape, data_type };
-
-        // Fill reference
-        fill(src);
-
-        return reference::quantization_layer<T>(src);
-    }
-
-    TensorType            _target{};
-    SimpleTensor<uint8_t> _reference{};
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class QAsymm8QuantizationValidationFixture : public framework::Fixture
-{
 public:
     template <typename...>
     void setup(TensorShape shape, DataType data_type, QuantizationInfo quant_info)
diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp
index 3d6c5bc13d..2f3348178c 100644
--- a/tests/validation/reference/QuantizationLayer.cpp
+++ b/tests/validation/reference/QuantizationLayer.cpp
@@ -33,55 +33,6 @@ namespace validation
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src)
-{
-    // Create reference
-    SimpleTensor<uint8_t> dst{ src.shape(), DataType::U8 };
-
-    const int width       = src.shape().x();
-    const int height      = src.shape().y();
-    const int depth       = src.shape().z();
-    const int stride_w    = width * height * depth;
-    const int num_batches = src.shape().total_size_upper(3);
-
-    for(int k = 0; k < num_batches; ++k)
-    {
-        // Compute min and max of the 3D tensor
-        float min = src[k * stride_w];
-        float max = src[k * stride_w];
-
-        // Look for min and max values
-        for(int i = 1; i < stride_w; ++i)
-        {
-            float val = src[i + k * stride_w];
-            min       = std::min(min, val);
-            max       = std::max(max, val);
-        }
-
-        // Saturate the result in case min = max
-        if(min == max)
-        {
-            min = 0.0f;
-            max = 1.0f;
-        }
-
-        const float range = max - min;
-
-        for(int i = 0; i < stride_w; ++i)
-        {
-            // map values to range [0.0, 1.0]
-            float       val        = src[i + k * stride_w];
-            const float normalized = (val - min) / range;
-            dst[i + k * stride_w]  = static_cast<uint8_t>(std::min(255.0f, normalized * 256.0f));
-        }
-    }
-
-    return dst;
-}
-
-template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<float> &src);
-
 template <typename T>
 SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src, const QuantizationInfo quantization_info)
 {
@@ -98,6 +49,7 @@ SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src, const Quant
     }
     return dst;
 }
+
 template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<half> &src, const QuantizationInfo quantization_info);
 template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<float> &src, const QuantizationInfo quantization_info);
 } // namespace reference
diff --git a/tests/validation/reference/QuantizationLayer.h b/tests/validation/reference/QuantizationLayer.h
index 60d8ea4023..2d136908af 100644
--- a/tests/validation/reference/QuantizationLayer.h
+++ b/tests/validation/reference/QuantizationLayer.h
@@ -35,9 +35,6 @@ namespace validation
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src);
-
 template <typename T>
 SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src, const QuantizationInfo quantization_info);
 } // namespace reference
-- 
cgit v1.2.1