3 files changed, 117 insertions, 49 deletions
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
index ac193e8fb6..6899b952e0 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "helpers.h"
+#include "helpers_asymm.h"
 
 #if DATA_SIZE == 32
 #define VEC_SIZE 4
@@ -29,24 +30,41 @@
 #elif DATA_SIZE == 16
 #define VEC_SIZE 8
 #define VEC_MAX vec8_max
-#else /* DATA_SIZE not equals 32 or 16 */
+#elif DATA_SIZE == 8
+#define VEC_SIZE 16
+#define VEC_MAX vec16_max
+#else /* DATA_SIZE not equals 8, 16, 32 */
 #error "Unsupported data size"
 #endif /* DATA_SIZE == 32 */
 
+// Define whether to use max (Quantized datatype) or fmax (Float) functions
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+#define MAX(x, y) max(x, y)
+#else // !(defined(OFFSET_OUT) && defined(SCALE_OUT)
+#define MAX(x, y) fmax(x, y)
+#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
+
 inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec)
 {
     VEC_DATA_TYPE(DATA_TYPE, 2)
-    temp = fmax(vec.lo, vec.hi);
-    return fmax(temp.x, temp.y);
+    temp = MAX(vec.lo, vec.hi);
+    return MAX(temp.x, temp.y);
 }
 
 inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec)
 {
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    temp = fmax(vec.lo, vec.hi);
+    temp = MAX(vec.lo, vec.hi);
     return vec4_max(temp);
 }
 
+inline DATA_TYPE vec16_max(VEC_DATA_TYPE(DATA_TYPE, 16) vec)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp = MAX(vec.lo, vec.hi);
+    return vec8_max(temp);
+}
+
 /** Performs a roi pooling on a single output pixel.
  *
  * @param[in] input          Pointer to input Tensor3D struct.
@@ -69,7 +87,8 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
     {
         int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE);
         VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(-FLT_MAX);
+        curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(MIN_VALUE);
+
         for(int j = region_start_y; j < region_end_y; ++j)
         {
             int i = region_start_x;
@@ -77,27 +96,34 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
             {
                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
                 val      = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz));
-                curr_max = fmax(val, curr_max);
+                curr_max = MAX(val, curr_max);
             }
             for(; i < region_end_x; ++i)
             {
                 DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz);
-                curr_max      = fmax(curr_max, val);
+                curr_max      = MAX(curr_max, val);
             }
         }
-        return (DATA_TYPE)VEC_MAX(curr_max);
+
+        const DATA_TYPE temp = (DATA_TYPE)VEC_MAX(curr_max);
+
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+        return QUANTIZE(temp, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+#endif /* if quantized, requantize and return */
+
+        return temp;
     }
 }
 
 /** Performs a roi pooling function.
  *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32, QASYMM8;
  * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
  * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
  * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
  * @note Spatial scale must be passed using -DSPATIAL_SCALE;
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32, QASYMM8
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
@@ -111,7 +137,7 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
  * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
  * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
  * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as input
  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
@@ -139,9 +165,9 @@ __kernel void roi_pooling_layer(
 
     // Load roi parameters
     // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
-    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    roi               = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const ushort roi_batch = (ushort) * ((__global ushort *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(ushort, 4)
+    roi               = vload4(0, (__global ushort *)offset(&rois, 1, pw));
     const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
     const int2 roi_dims   = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
 
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 5867cde3bd..2deb8fac81 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -36,6 +36,7 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
+#include <float.h>
 #include <cmath>
 #include <set>
 #include <string>
@@ -44,13 +45,13 @@ namespace arm_compute
 {
 namespace
 {
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto initialization if not yet initialized
     TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
-    auto_init_if_empty((*output), output_shape, 1, input->data_type());
+    auto_init_if_empty((*output), output_shape, 1, input->data_type(), output->quantization_info());
 
     // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 1;
@@ -70,31 +71,38 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
 {
 }
 
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+
+    //Validate arguments
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
+        ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+    }
+
+    return Status{};
+}
+
 void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
-
-    //Validate arguments
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
-    ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
-    ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-
-    if(output->info()->total_size() != 0)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
@@ -106,20 +114,39 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
     _output    = output;
     _pool_info = pool_info;
 
+    const DataType data_type = input->info()->data_type();
+    const bool     is_qasymm = is_data_type_quantized_asymmetric(data_type);
+
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
-    build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
-    build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
-    build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
-    build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
-    build_opts.emplace(("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(data_type));
+    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+    build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+    build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+    build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
+
+    if(is_qasymm)
+    {
+        // Determine quantization info scale, offset
+        UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
+        uqinfo                         = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
+
+        // Specify minimum possible value of datatype
+        build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(0));
+    }
+    else{
+        // Specify min value of F32 datatype
+        build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(-FLT_MAX));
+    }
 
     // Create kernel
     std::string kernel_name = "roi_pooling_layer";
-    _kernel                 = create_kernel(compile_context, kernel_name, build_opts);
+    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set static kernel arguments
     unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 124ae3f268..7b7b457632 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -63,7 +63,7 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: F16/F32.
+     * @param[in]  input           Source tensor. Data types supported: F16/F32/QASYMM8
      * @param[in]  rois            ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
      *                             as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
      * @param[out] output          Destination tensor. Data types supported: Same as @p input.
@@ -74,15 +74,30 @@ public:
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
+    /** Static Validate function to check inputs will lead to valid configuration of @ref CLROIPoolingLayer
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32/QASYMM8
+     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
+     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+
 private:
     const ICLTensor    *_input;
     const ICLTensor    *_rois;
-    ICLTensor          *_output;
+    const ICLTensor    *_output;
     ROIPoolingLayerInfo _pool_info;
 };
 } // namespace arm_compute