author    Michele Di Giorgio <michele.digiorgio@arm.com>  2018-11-01 13:44:05 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com>    2018-11-13 14:49:34 +0000
commit    27400b90a9cb3fe028c5b724b58ce0e82d89b5e8 (patch)
tree      4b7dd9d4b29653ada018172dae826fe3e6ef5e08
parent    bb081cac4f386eb6db6e9927fce27c7027dd7be5 (diff)
download  ComputeLibrary-27400b90a9cb3fe028c5b724b58ce0e82d89b5e8.tar.gz
COMPMID-1707: Create 3 special CLWidthConcatenate kernels to concatenate 2/4 and 8 tensors (Part 1)
Creating special cases for concatenating 2 and 4 tensors.
Change-Id: I6a739a494ae45011acb65369e353f9ef96970b90
-rw-r--r--arm_compute/core/CL/CLKernels.h2
-rw-r--r--arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h79
-rw-r--r--arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h85
-rw-r--r--arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h8
-rw-r--r--src/core/CL/CLKernelLibrary.cpp2
-rw-r--r--src/core/CL/cl_kernels/concatenate.cl208
-rw-r--r--src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp151
-rw-r--r--src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp171
-rw-r--r--src/runtime/CL/functions/CLWidthConcatenateLayer.cpp75
-rw-r--r--tests/validation/fixtures/WidthConcatenateLayerFixture.h2
10 files changed, 764 insertions, 19 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 36abb7bd78..df76366a4b 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -127,6 +127,8 @@
#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
new file mode 100644
index 0000000000..cc2eaa25f2
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H__
+#define __ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the width concatenate kernel of 2 tensors.
+ * The input1 and input2 tensors will be concatenated into the output tensor.
+ */
+class CLWidthConcatenate2TensorsKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLWidthConcatenate2TensorsKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default;
+ /** Default destructor */
+ ~CLWidthConcatenate2TensorsKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input1 First input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input2 Second input tensor. Data types supported: same as @p input1
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
+ *
+ * @param[in] input1 First tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input2 Second tensor info. Data types supported: same as @p input1
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H__ */
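
For context (not part of this patch): a minimal sketch of driving the new two-tensor kernel directly, assuming a standard CLScheduler setup and illustrative F32 shapes. In practice the CLWidthConcatenateLayer function (further down) selects this kernel automatically when given exactly two inputs; configure() is called before allocation so the kernel can extend the tensors' padding.

#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void concat_two_tensors_example()
{
    CLScheduler::get().default_init();

    CLTensor in1, in2, out;
    in1.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    in2.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(24U, 4U), 1, DataType::F32)); // width = 8 + 16

    CLWidthConcatenate2TensorsKernel kernel;
    kernel.configure(&in1, &in2, &out); // configure before allocating: may grow padding

    in1.allocator()->allocate();
    in2.allocator()->allocate();
    out.allocator()->allocate();

    CLScheduler::get().enqueue(kernel, true); // blocking run
}
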
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
new file mode 100644
index 0000000000..952fd99beb
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H__
+#define __ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the width concatenate kernel of 4 tensors.
+ * All input tensors will be concatenated into the output tensor.
+ */
+class CLWidthConcatenate4TensorsKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLWidthConcatenate4TensorsKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default;
+ /** Default destructor */
+ ~CLWidthConcatenate4TensorsKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input1 First input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input2 Second input tensor. Data types supported: same as @p input1
+ * @param[in] input3 Third input tensor. Data types supported: same as @p input1
+ * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
+ *
+ * @param[in] input1 First tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input2 Second tensor info. Data types supported: same as @p input1
+ * @param[in] input3 Third tensor info. Data types supported: same as @p input1
+ * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ const ICLTensor *_input3;
+ const ICLTensor *_input4;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H__ */
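
For context (not part of this patch): a minimal sketch, assuming the library's Status-to-bool conversion semantics, of querying the static validate() before creating any OpenCL resources. It mirrors the checks configure() performs: matching data types, output width at least the sum of the input widths, and identical dimensions above the concatenation axis.

#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

// Returns true if the four tensor infos can be concatenated along the width.
bool can_concat_four_along_width(const ITensorInfo *in1, const ITensorInfo *in2,
                                 const ITensorInfo *in3, const ITensorInfo *in4,
                                 const ITensorInfo *out)
{
    const Status status = CLWidthConcatenate4TensorsKernel::validate(in1, in2, in3, in4, out);
    return static_cast<bool>(status); // Status converts to true on success
}
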
diff --git a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
index 44462b02b2..55b65dadc4 100644
--- a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
@@ -29,6 +29,8 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include <memory>
@@ -40,7 +42,9 @@ class ICLTensor;
/** Basic function to concatenate tensors along the X axis. This function calls one of the following kernels:
*
- * -# @ref CLDepthConcatenateLayerKernel
+ * -# @ref CLWidthConcatenateLayerKernel
+ * -# @ref CLWidthConcatenate2TensorsKernel (if there are exactly 2 input tensors)
+ * -# @ref CLWidthConcatenate4TensorsKernel (if there are exactly 4 input tensors)
*
*/
class CLWidthConcatenateLayer : public IFunction
@@ -74,6 +78,8 @@ public:
private:
std::unique_ptr<CLWidthConcatenateLayerKernel[]> _concat_kernels_vector;
+ CLWidthConcatenate2TensorsKernel _concat_x2_kernel;
+ CLWidthConcatenate4TensorsKernel _concat_x4_kernel;
unsigned int _num_inputs;
};
} // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index b9b3ce970b..847236925a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -182,6 +182,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "combine_gradients_L2", "canny.cl" },
{ "concatenate_depth", "concatenate.cl" },
{ "concatenate_width", "concatenate.cl" },
+ { "concatenate_width_x2", "concatenate.cl" },
+ { "concatenate_width_x4", "concatenate.cl" },
{ "convolution_rectangle", "convolution_rectangle.cl" },
{ "col2im", "col2im.cl" },
{ "convert_depth_down", "depth_convert.cl" },
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index a232a94dfc..0e8805f9b6 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -25,13 +25,218 @@
#if defined(DATA_TYPE) && defined(VEC_SIZE)
+#if defined(DEPTH) && defined(ELEMENT_SIZE)
+
+#if defined(INPUT1_WIDTH)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+#if VEC_SIZE == 2
+#define SEQ ((int2)(0, 1))
+#elif VEC_SIZE == 4
+#define SEQ ((int4)(0, 1, 2, 3))
+#elif VEC_SIZE == 8
+#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7))
+#elif VEC_SIZE == 16
+#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
+#else // VEC_SIZE
+#error "Vector size not supported"
+#endif // VEC_SIZE
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. e.g. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. e.g. -DVEC_SIZE=16
+ * @note The element size has to be passed at compile time using -DELEMENT_SIZE. e.g. -DELEMENT_SIZE=4
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ *
+ * @param[in] src1_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src1_step_w src1_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src2_step_w src2_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x2(
+ TENSOR4D_DECLARATION(src1),
+ TENSOR4D_DECLARATION(src2),
+ TENSOR4D_DECLARATION(dst))
+{
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+ // Calculate input indices
+ const int x = get_global_id(0) * (int)VEC_SIZE;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+ const int x1 = min(x, (int)INPUT1_WIDTH);
+ const int x2 = max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE);
+
+ // Calculate inputs and output addresses
+ const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+ const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+
+ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond);
+
+ VSTORE(VEC_SIZE)
+ (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. e.g. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. e.g. -DVEC_SIZE=16
+ * @note The element size has to be passed at compile time using -DELEMENT_SIZE. e.g. -DELEMENT_SIZE=4
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
+ * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
+ *
+ * @param[in] src1_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src1_step_w src1_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src2_step_w src2_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src3_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src3_step_w src3_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src4_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src4_step_w src4_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x4(
+ TENSOR4D_DECLARATION(src1),
+ TENSOR4D_DECLARATION(src2),
+ TENSOR4D_DECLARATION(src3),
+ TENSOR4D_DECLARATION(src4),
+ TENSOR4D_DECLARATION(dst))
+{
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+ // Calculate input indices
+ const int x = get_global_id(0) * (int)VEC_SIZE;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+
+ const int x1 = min(x, (int)INPUT1_WIDTH);
+ const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE), (int)INPUT2_WIDTH);
+ const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)VEC_SIZE), (int)INPUT3_WIDTH);
+ const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)VEC_SIZE);
+
+ // Calculate inputs and output addresses
+ const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+ const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+ const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+ const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = select(src2_values, src1_values, cond_in2);
+ values = select(src3_values, values, cond_in3);
+ values = select(src4_values, values, cond_in4);
+
+ VSTORE(VEC_SIZE)
+ (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
+#endif /* defined(INPUT1_WIDTH) */
+#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
+
#if defined(WIDTH_OFFSET) && defined(DEPTH)
/** This kernel concatenates the input tensor into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
* @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
* @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
- * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH16
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -53,7 +258,6 @@
 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] offset The offset to the first valid element of the output tensor in bytes
*/
__kernel void concatenate_width(
TENSOR4D_DECLARATION(src),
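
For context (not part of this patch): the new x2/x4 kernels replace per-input copy passes with a single pass that loads one vector from every input at a clamped offset and blends lanes with select(). A scalar C++ model of the two-input case, assuming VEC_SIZE = 8, INPUT1_WIDTH = 10 and a destination width that is a multiple of VEC_SIZE, illustrates the index clamping. Note the real kernel performs both vector loads unconditionally, which is why the host-side access windows (below) pad each input by an extra vector.

#include <algorithm>

constexpr int VEC_SIZE     = 8;  // illustrative; matches num_elems_processed_per_iteration
constexpr int INPUT1_WIDTH = 10; // illustrative first-input width

// Scalar model of concatenate_width_x2 for a single row.
void concat_x2_row(const float *in1, const float *in2, float *dst, int dst_width)
{
    for(int x = 0; x < dst_width; x += VEC_SIZE)
    {
        // x1 never runs past input1; x2 never drops below -VEC_SIZE, mirroring
        // the min/max clamping in the kernel.
        const int x1 = std::min(x, INPUT1_WIDTH);
        const int x2 = std::max(x - INPUT1_WIDTH, -VEC_SIZE);

        for(int lane = 0; lane < VEC_SIZE; ++lane)
        {
            // cond = (x_coords < INPUT1_WIDTH): lanes left of the boundary come
            // from input1, the rest from input2 (select(src2, src1, cond)).
            const bool from_in1 = (x + lane) < INPUT1_WIDTH;
            dst[x + lane]       = from_in1 ? in1[x1 + lane] : in2[x2 + lane];
        }
    }
}

For a straddling work item, e.g. x = 8: x1 = 8, x2 = -2, so lanes 0-1 read in1[8..9] and lanes 2-7 read in2[0..5], which is exactly element 0 onwards of the second input. The x4 kernel applies the same clamping per input and chains three select() calls.
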
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
new file mode 100644
index 0000000000..b0d27cbc87
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+ // The window needs to be based on the output
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+ AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input2->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0));
+
+ for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure_internal(std::get<1>(win_config));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_width_x2_";
+ _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+}
+
+void CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input1, slice);
+ add_4D_tensor_argument(idx, _input2, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, window, lws_hint());
+ }
+ while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
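
For context (not part of this patch): a rough sketch of the extents the access windows above request, assuming num_elems_processed_per_iteration = 8 and illustrative widths; ceil_to_multiple here is a local stand-in for the library helper of the same name.

#include <cstdio>

// Local stand-in for arm_compute's ceil_to_multiple helper.
static unsigned int ceil_to_multiple(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    const unsigned int step = 8;             // num_elems_processed_per_iteration
    const unsigned int w1 = 10, w2 = 14;     // illustrative input widths

    // input1: loads start at x1 = min(x, INPUT1_WIDTH) and cover a full vector,
    // so reads can reach up to INPUT1_WIDTH + VEC_SIZE; hence width rounded up
    // to a whole vector plus one extra vector of right border.
    std::printf("input1 extent: [0, %u)\n", ceil_to_multiple(w1, step) + step);

    // input2: x2 is clamped no lower than -VEC_SIZE, hence one vector of left
    // border in addition to the right border.
    std::printf("input2 extent: [-%u, %u)\n", step, ceil_to_multiple(w2, step) + step);
    return 0;
}
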
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000000..75aef9cce0
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
+{
+ // The window needs to be based on the output
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+ AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input2->dimension(1));
+ AccessWindowStatic input3_access(input3, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input3->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input3->dimension(1));
+ AccessWindowStatic input4_access(input4, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input4->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input4->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0));
+
+ for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
+ : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), input3->clone().get(), input4->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _input3 = input3;
+ _input4 = input4;
+ _output = output;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0)));
+ build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure_internal(std::get<1>(win_config));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_width_x4_";
+ _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input3->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input3->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input4->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input4->info()->dimension(1));
+}
+
+void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input1, slice);
+ add_4D_tensor_argument(idx, _input2, slice);
+ add_4D_tensor_argument(idx, _input3, slice);
+ add_4D_tensor_argument(idx, _input4, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, window, lws_hint());
+ }
+ while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 5233ff4f52..46a2d80d10 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -36,26 +36,46 @@ using namespace arm_compute;
CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
: _concat_kernels_vector(),
+ _concat_x2_kernel(),
+ _concat_x4_kernel(),
_num_inputs(0)
{
}
Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
{
+ const unsigned int num_inputs = inputs_vector.size();
+
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
// Output auto initialization if not yet initialized
TensorInfo tmp_output_info = *output->clone();
TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
- unsigned int width_offset = 0;
- for(const auto &input : inputs_vector)
+ switch(num_inputs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
- width_offset += input->dimension(0);
+ case 2:
+ // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
+ break;
+ case 4:
+ // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
+ break;
+ default:
+ unsigned int width_offset = 0;
+ // Validate generic case of WidthConcatenate kernel
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+ width_offset += input->dimension(0);
+ }
+ break;
}
return Status{};
@@ -74,16 +94,30 @@ void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector,
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- unsigned int width_offset = 0;
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+ ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
- for(unsigned int i = 0; i < _num_inputs; i++)
+ switch(_num_inputs)
{
- _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
- width_offset += inputs_vector.at(i)->info()->dimension(0);
+ case 2:
+ // Configure WidthConcatenate2Tensors kernel
+ _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
+ break;
+ case 4:
+ // Configure WidthConcatenate4Tensors kernel
+ _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+ break;
+ default:
+ // Configure generic case WidthConcatenate kernels
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+ unsigned int width_offset = 0;
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+ width_offset += inputs_vector.at(i)->info()->dimension(0);
+ }
+ break;
}
}
@@ -91,8 +125,19 @@ void CLWidthConcatenateLayer::run()
{
cl::CommandQueue q = CLScheduler::get().queue();
- for(unsigned i = 0; i < _num_inputs; i++)
+ switch(_num_inputs)
{
- CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ case 2:
+ CLScheduler::get().enqueue(_concat_x2_kernel, true);
+ break;
+ case 4:
+ CLScheduler::get().enqueue(_concat_x4_kernel, true);
+ break;
+ default:
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+ break;
}
}
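
For context (not part of this patch): caller code is unchanged by this optimisation; the function now routes two- and four-input cases to the specialised kernels internally. A minimal usage sketch, with illustrative shapes:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"

using namespace arm_compute;

void width_concat_two_inputs()
{
    CLScheduler::get().default_init();

    CLTensor in1, in2, out;
    in1.allocator()->init(TensorInfo(TensorShape(12U, 4U, 2U), 1, DataType::F32));
    in2.allocator()->init(TensorInfo(TensorShape(20U, 4U, 2U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(32U, 4U, 2U), 1, DataType::F32)); // 12 + 20

    CLWidthConcatenateLayer concat;
    concat.configure({ &in1, &in2 }, &out); // two inputs -> _concat_x2_kernel path

    in1.allocator()->allocate();
    in2.allocator()->allocate();
    out.allocator()->allocate();

    concat.run(); // enqueues CLWidthConcatenate2TensorsKernel
}
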
diff --git a/tests/validation/fixtures/WidthConcatenateLayerFixture.h b/tests/validation/fixtures/WidthConcatenateLayerFixture.h
index caad0feee0..1f79210350 100644
--- a/tests/validation/fixtures/WidthConcatenateLayerFixture.h
+++ b/tests/validation/fixtures/WidthConcatenateLayerFixture.h
@@ -52,7 +52,7 @@ public:
{
// Create input shapes
std::mt19937 gen(library->seed());
- std::uniform_int_distribution<> num_dis(2, 4);
+ std::uniform_int_distribution<> num_dis(2, 8);
const int num_tensors = num_dis(gen);
std::vector<TensorShape> shapes(num_tensors, shape);