author    Michele Di Giorgio <michele.digiorgio@arm.com>  2018-10-19 15:46:19 +0100
committer Anthony Barbier <anthony.barbier@arm.com>       2018-11-02 16:55:45 +0000
commit    e6dbde0128bf33b5d72a00c480bd92c290fd17b7 (patch)
tree      1e5f501a536b7b991ff54430f377b37e37d6d929
parent    bcf8a968da4b26926df8bb770df16d82146bcb54 (diff)
COMPMID-1667: Add 4D tensors support to CLWidthConcatenateLayerKernel
Change-Id: Ibc0b1242804c2fdb183825406e3c78bd0d1d3564
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154368
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
-rw-r--r--  src/core/CL/cl_kernels/concatenate.cl                   26
-rw-r--r--  src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp   21
-rw-r--r--  tests/datasets/ShapeDatasets.h                            3
-rw-r--r--  tests/validation/CL/WidthConcatenateLayer.cpp            15
-rw-r--r--  tests/validation/reference/WidthConcatenateLayer.cpp     20
5 files changed, 50 insertions, 35 deletions
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 16c4363899..a232a94dfc 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -23,12 +23,15 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE)
-#if defined(WIDTH_OFFSET)
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#if defined(WIDTH_OFFSET) && defined(DEPTH)
/** This kernel concatenates the input tensor into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
* @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -37,6 +40,8 @@
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -45,15 +50,17 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] offset The offset to the first valid element of the output tensor in bytes
*/
__kernel void concatenate_width(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
@@ -61,10 +68,13 @@ __kernel void concatenate_width(
VSTORE(VEC_SIZE)
(source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
}
-#endif // defined(WIDTH_OFFSET)
+#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -97,4 +107,4 @@ __kernel void concatenate_depth(
VSTORE(VEC_SIZE)
(source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
}
-#endif // defined(DATA_TYPE)
\ No newline at end of file
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
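The new DEPTH define exists so the kernel can undo the window collapse performed on the host side (shown in the next file): the third global id now spans depth x batches, and the kernel splits it back into a plane index and a batch index before computing the byte offset. Below is a minimal C++ sketch of that address computation; the helper and its names are illustrative assumptions, not the library's CONVERT_TO_TENSOR4D_STRUCT macro itself.

// Illustrative sketch (assumed semantics, not library code): how a work item of
// concatenate_width can locate its element once Z and W share get_global_id(2).
#include <cstddef>
#include <cstdint>

struct Steps4D
{
    std::size_t x, y, z, w; // per-work-item steps in bytes, as passed via TENSOR4D_DECLARATION
};

// gid0/gid1/gid2 stand in for get_global_id(0/1/2); depth is the -DDEPTH value.
inline const std::uint8_t *element_ptr(const std::uint8_t *base,
                                       std::size_t offset_first_element_in_bytes,
                                       const Steps4D &step,
                                       std::size_t gid0, std::size_t gid1, std::size_t gid2,
                                       std::size_t depth)
{
    const std::size_t z = gid2 % depth; // plane index inside one batch
    const std::size_t w = gid2 / depth; // batch index
    return base + offset_first_element_in_bytes
           + gid0 * step.x + gid1 * step.y + z * step.z + w * step.w;
}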
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index e5ab8d2304..c51c5796d1 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -53,8 +53,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, win_collapsed);
}
Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
{
@@ -69,7 +71,7 @@ Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, c
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
return Status{};
}
@@ -103,6 +105,7 @@ void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned i
build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
@@ -119,14 +122,8 @@ void CLWidthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_4D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window);
}
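On the host side, the per-slice loop in run() is replaced by a single enqueue over a window collapsed from DimZ onwards, so the third launch dimension covers depth x batches and a single add_4D_tensor_argument/enqueue call handles every batch. A hedged sketch of that idea follows; the real code simply calls win.collapse(win, Window::DimZ), and the Range type and helper below are illustrative only.

// Illustrative sketch only: merging the Z and W iteration ranges into one
// launch dimension, which the kernel later splits again using -DDEPTH.
#include <cstddef>

struct Range
{
    std::size_t start, end, step;
};

inline Range collapse_z_and_w(const Range &z, const Range &w)
{
    const std::size_t depth   = (z.end - z.start) / z.step;
    const std::size_t batches = (w.end - w.start) / w.step;
    return Range{ 0, depth * batches, 1 }; // one work item per (z, w) pair along dimension 2
}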
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 483fada5d8..ffacf34620 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -730,7 +730,8 @@ public:
{
TensorShape{ 232U, 65U, 3U },
TensorShape{ 432U, 65U, 3U },
- TensorShape{ 124U, 65U, 3U }
+ TensorShape{ 124U, 65U, 3U },
+ TensorShape{ 124U, 65U, 3U, 4U }
})
{
}
diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp
index 6af3c64f73..6ff1dfca54 100644
--- a/tests/validation/CL/WidthConcatenateLayer.cpp
+++ b/tests/validation/CL/WidthConcatenateLayer.cpp
@@ -103,14 +103,16 @@ using CLWidthConcatenateLayerFixture = WidthConcatenateLayerValidationFixture<CL
TEST_SUITE(Float)
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
- DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(concat(datasets::Small2DShapes(), datasets::Tiny4DShapes()),
+ framework::dataset::make("DataType",
+ DataType::F16)))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLWidthConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
- DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWidthConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(concat(datasets::Large2DShapes(), datasets::Small4DShapes()),
+ framework::dataset::make("DataType",
+ DataType::F16)))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -118,8 +120,9 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWidthConcatenateLayerFixture<half>, framework
TEST_SUITE_END()
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
- DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(concat(datasets::Small2DShapes(), datasets::Tiny4DShapes()),
+ framework::dataset::make("DataType",
+ DataType::F32)))
{
// Validate output
validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/reference/WidthConcatenateLayer.cpp b/tests/validation/reference/WidthConcatenateLayer.cpp
index 8662199306..6be171b64d 100644
--- a/tests/validation/reference/WidthConcatenateLayer.cpp
+++ b/tests/validation/reference/WidthConcatenateLayer.cpp
@@ -59,20 +59,24 @@ SimpleTensor<T> widthconcatenate_layer(const std::vector<SimpleTensor<T>> &srcs)
{
ARM_COMPUTE_ERROR_ON(width_offset >= width_out);
- const int width = src.shape().x();
- const int height = src.shape().y();
- const int depth = src.shape().z();
+ const int width = src.shape().x();
+ const int height = src.shape().y();
+ const int depth = src.shape().z();
+ const int upper_dims = src.shape().total_size() / (width * height * depth);
const T *src_ptr = src.data();
T *dst_ptr = dst.data();
- for(int d = 0; d < depth; ++d)
+ for(int u = 0; u < upper_dims; ++u)
{
- for(int r = 0; r < height; ++r)
+ for(int d = 0; d < depth; ++d)
{
- int offset = d * height + r;
- std::copy(src_ptr, src_ptr + width, dst_ptr + width_offset + offset * width_out);
- src_ptr += width;
+ for(int r = 0; r < height; ++r)
+ {
+ const int offset = u * height * depth + d * height + r;
+ std::copy(src_ptr, src_ptr + width, dst_ptr + width_offset + offset * width_out);
+ src_ptr += width;
+ }
}
}
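The reference implementation now adds an upper_dims loop over everything above the third dimension, so the destination row index becomes u * height * depth + d * height + r, and each source row of width elements is copied into the output starting at column width_offset. A small worked example of that indexing, with arbitrarily chosen sizes (not taken from the test datasets):

// Worked example of the reference indexing above, with made-up sizes:
// src shape = (width 4, height 2, depth 3, batches 2), width_out = 10, width_offset = 6.
#include <cassert>

int main()
{
    const int height = 2, depth = 3, batches = 2;
    const int width_out = 10, width_offset = 6;

    // Destination element index of the first element of the last source row:
    const int u = batches - 1, d = depth - 1, r = height - 1;
    const int offset = u * height * depth + d * height + r; // 1*6 + 2*2 + 1 = 11
    assert(offset * width_out + width_offset == 116);       // row 11 of dst, starting at column 6
    return 0;
}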