From cc5171b85654b9f19a5f52bbe8abea0572ee0163 Mon Sep 17 00:00:00 2001
From: Manuel Bottini <manuel.bottini@arm.com>
Date: Wed, 9 Jan 2019 17:04:39 +0000
Subject: COMPMID-1677: Change ROIPooling layer interface to accept ROIs as
 tensors

Change-Id: If16b572a4d906187b77f32133a72a44316fa74e4
Reviewed-on: https://review.mlplatform.org/490
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
---
 src/core/CL/cl_kernels/roi_pooling_layer.cl       | 28 ++++----
 src/core/CL/kernels/CLROIPoolingLayerKernel.cpp   | 82 +++++++++++++++--------
 src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 53 ++++++++++-----
 src/runtime/CL/functions/CLROIPoolingLayer.cpp    |  4 +-
 src/runtime/NEON/functions/NEROIPoolingLayer.cpp  |  9 +--
 5 files changed, 113 insertions(+), 63 deletions(-)

(limited to 'src')

diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
index 042b102a15..0cf296c011 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,10 +105,12 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source image as specifed by ROI
- * @param[in]  rois_ptr                             Pointer to the rois array. Layout: {x, y, width, height, batch_indx}
- * @param[in]  rois_stride_x                        Stride of the rois array in X dimension (in bytes)
- * @param[in]  rois_step_x                          rois_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the rois array
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -122,13 +124,13 @@ inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int reg
  */
 __kernel void roi_pooling_layer(
     TENSOR3D_DECLARATION(input),
-    VECTOR_DECLARATION(rois),
+    IMAGE_DECLARATION(rois),
     TENSOR3D_DECLARATION(output),
     unsigned int input_stride_w, unsigned int output_stride_w)
 {
     // Get pixels pointer
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    Vector   rois   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(rois);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
 
     const int px = get_global_id(0);
@@ -136,12 +138,12 @@ __kernel void roi_pooling_layer(
     const int pw = get_global_id(2);
 
     // Load roi parameters
-    // roi is laid out as follows:
-    // { x, y, width, height, batch_index }
-    const ushort4 roi      = vload4(0, (__global ushort *)vector_offset(&rois, pw));
-    const ushort roi_batch = *((__global ushort *)vector_offset(&rois, pw) + 4);
-    const int2 roi_anchor  = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
-    const int2 roi_dims    = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi               = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+    const int2 roi_dims   = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
 
     // Calculate pooled region start and end
     const float2 spatial_indx     = (float2)(px, py);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 23676942a6..df7687edea 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,29 +39,61 @@
 #include <set>
 #include <string>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+    auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 1;
+    Window             win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
     : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
 {
 }
 
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+
+    //Validate arguments
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+    ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
+    ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
     ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-    ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
 
-    // Output auto inizialitation if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    if(output->info()->total_size() != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
+    }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(rois->num_values() != output->info()->dimension(3));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     // Set instance variables
     _input     = input;
@@ -89,19 +121,7 @@ void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArra
     add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
     add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
 
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 1;
-    Window             window                            = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(input->info(),
-                                    input->info()->valid_region().start(0),
-                                    input->info()->valid_region().start(1),
-                                    input->info()->valid_region().end(0),
-                                    input->info()->valid_region().end(1));
-    AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
-
-    update_window_and_padding(window, input_access, output_access);
-    output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    ICLKernel::configure_internal(window);
+    ICLKernel::configure_internal(win_config.second);
 }
 
 void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -109,14 +129,20 @@ void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
-    // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
+    Window slice      = window.first_slice_window_3D();
+    Window slice_rois = slice;
+    // Parallelize spatially and across the fourth dimension of the output tensor (also across ROITensor)
+    slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
     slice.set(Window::DimZ, window[3]);
 
     // Set arguments
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input, slice);
-    add_1D_array_argument<ROI>(idx, _rois, Strides(sizeof(ROI)), 1U, slice);
+    add_2D_tensor_argument(idx, _rois, slice_rois);
     add_3D_tensor_argument(idx, _output, slice);
+    add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+    add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
     enqueue(queue, *this, slice);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 4d908db77b..6fd6792ff8 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,22 +35,35 @@
 #include <cfloat>
 #include <cmath>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
     : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
 {
 }
 
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
+
+    //Validate arguments
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+    ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
+    ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-    ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
 
-    // Output auto inizialitation if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+    if(output->info()->total_size() != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
+    }
+
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -64,7 +77,7 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *r
 
     // Configure kernel window
     Window window;
-    window.set(Window::DimX, Window::Dimension(0, rois->num_values()));
+    window.set(Window::DimX, Window::Dimension(0, rois->info()->dimension(1)));
     window.set(Window::DimY, Window::Dimension(0, 1));
 
     AccessWindowStatic input_access(input->info(),
@@ -85,6 +98,8 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
+    const size_t values_per_roi = _rois->info()->dimension(0);
+
     const int   roi_list_start = window.x().start();
     const int   roi_list_end   = window.x().end();
     const int   width          = _input->info()->dimension(Window::DimX);
@@ -94,16 +109,21 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
     const int   pooled_h       = _pool_info.pooled_height();
     const float spatial_scale  = _pool_info.spatial_scale();
 
+    const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
+
     for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
     {
-        const ROI &curr_roi = _rois->at(roi_indx);
+        const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
+        const auto         x1        = rois_ptr[values_per_roi * roi_indx + 1];
+        const auto         y1        = rois_ptr[values_per_roi * roi_indx + 2];
+        const auto         x2        = rois_ptr[values_per_roi * roi_indx + 3];
+        const auto         y2        = rois_ptr[values_per_roi * roi_indx + 4];
 
         // Scale ROI
-        const int roi_batch    = curr_roi.batch_idx;
-        const int roi_anchor_x = support::cpp11::round(curr_roi.rect.x * spatial_scale);
-        const int roi_anchor_y = support::cpp11::round(curr_roi.rect.y * spatial_scale);
-        const int roi_width    = std::max(support::cpp11::round(curr_roi.rect.width * spatial_scale), 1.f);
-        const int roi_height   = std::max(support::cpp11::round(curr_roi.rect.height * spatial_scale), 1.f);
+        const int roi_anchor_x = support::cpp11::round(x1 * spatial_scale);
+        const int roi_anchor_y = support::cpp11::round(y1 * spatial_scale);
+        const int roi_width    = std::max(support::cpp11::round((x2 - x1) * spatial_scale), 1.f);
+        const int roi_height   = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
 
         // Iterate through all feature maps
         for(int fm = 0; fm < fms; ++fm)
@@ -146,3 +166,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
         }
     }
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480eeac9..7bb41784ac 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,7 @@
 
 using namespace arm_compute;
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     // Configure ROI pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 1f1400cf42..3aca4b7b60 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,14 @@
 #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEROIPoolingLayer::NEROIPoolingLayer()
     : _roi_kernel()
 {
 }
 
-void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     _roi_kernel.configure(input, rois, output, pool_info);
 }
@@ -43,3 +43,4 @@ void NEROIPoolingLayer::run()
 {
     NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
 }
+} // namespace arm_compute
\ No newline at end of file
-- 
cgit v1.2.1