author     Gian Marco Iodice <gianmarco.iodice@arm.com>   2018-09-13 11:51:56 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>      2018-11-02 16:54:54 +0000
commit     8bab0ee5f44a0e2cfe60d4d9e040a2f5ae4ef9b1 (patch)
tree       7659331b8f28ef85d0f59a8cca22384b25006e58
parent     651540f5fc0529589867fc834f8c206c7e7a60c1 (diff)
COMPMID-1584 - Collapse batch size in CLChannelShuffleLayerKernel
COMPMID-1589 - Add support for NHWC to CLChannelShuffleLayerKernel

Change-Id: I13936a5cd1659d01fdb10b346e90f0d72d79f1f1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148475
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp                           1
-rw-r--r--  src/core/CL/cl_kernels/channel_shuffle.cl               201
-rw-r--r--  src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp      61
-rw-r--r--  tests/validation/CL/ChannelShuffle.cpp                    36
-rw-r--r--  tests/validation/fixtures/ChannelShuffleLayerFixture.h    16
5 files changed, 203 insertions(+), 112 deletions(-)
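Note: channel shuffle (https://arxiv.org/pdf/1707.01083.pdf) sends input channel c to output channel (c % K) * num_groups + c / K, where K = num_channels / num_groups; both kernels in this patch implement that mapping. A minimal scalar sketch of the NCHW semantics, assuming a flat float buffer (the names and layout are illustrative only, not the library's API):

    #include <cstddef>
    #include <vector>

    // Hedged reference: scalar channel shuffle over an NCHW float buffer.
    // K = channels / num_groups; dst channel z = (c % K) * num_groups + c / K.
    void channel_shuffle_nchw_ref(const std::vector<float> &src, std::vector<float> &dst,
                                  std::size_t batches, std::size_t channels,
                                  std::size_t height, std::size_t width, std::size_t num_groups)
    {
        const std::size_t K     = channels / num_groups; // channels per group
        const std::size_t plane = height * width;        // elements per channel plane
        for(std::size_t n = 0; n < batches; ++n)
        {
            for(std::size_t c = 0; c < channels; ++c)
            {
                const std::size_t z = (c % K) * num_groups + c / K; // shuffled channel
                for(std::size_t i = 0; i < plane; ++i)
                {
                    dst[(n * channels + z) * plane + i] = src[(n * channels + c) * plane + i];
                }
            }
        }
    }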
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index ef3a431f1a..fa164542e4 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -167,6 +167,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "channel_combine_UYVY422", "channel_combine.cl" },
{ "channel_combine_YUYV422", "channel_combine.cl" },
{ "channel_shuffle_nchw", "channel_shuffle.cl" },
+ { "channel_shuffle_nhwc", "channel_shuffle.cl" },
{ "channel_extract_NV12", "channel_extract.cl" },
{ "channel_extract_NV21", "channel_extract.cl" },
{ "channel_extract_RGB888", "channel_extract.cl" },
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 23962e1c2e..3ac67c58ae 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -23,19 +23,28 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
-// Check valid BLOCK_SIZES
-#if BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16
-#error "Only block sizes 4, 8 and 16 are supported"
-#endif /* BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16 */
+// Check valid VEC_SIZES
+#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-/** Perfoms channel shuffle see https://arxiv.org/pdf/1707.01083.pdf for details.
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
+
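Note on the new DIV_MOD_UINT helper above: it replaces an integer division/modulo pair with a float multiply-by-reciprocal plus one subtraction, which is typically cheaper on GPU hardware. A plain C++ rendering of the same idea, assuming the operands stay well inside float's 24-bit mantissa (true for the channel and batch indices this kernel feeds it):

    // Hedged equivalent of the OpenCL macro; not part of the patch.
    inline void div_mod_uint(unsigned x, unsigned y, unsigned &div_res, unsigned &mod_res)
    {
        div_res = static_cast<unsigned>(x * (1.0f / static_cast<float>(y))); // quotient via reciprocal
        mod_res = x - div_res * y;                                           // remainder by subtraction
    }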
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
*
- * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
* K is equal to num_channels / num_groups.
*
* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -45,6 +54,8 @@
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -53,80 +64,118 @@
* @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void channel_shuffle_nchw(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+ uint curr_channel = 0; // channel id of input
+ uint batch_id = 0; // batch id
+ uint group_id = 0; // group id
+ uint channel_id = 0; // channel id within the group
+
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
- const uint curr_channel = get_global_id(2); // channel id of input
- const uint group_id = curr_channel / K; // group id
- const uint channel_id = curr_channel % K; // channel id within the group
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
- const uint x = get_global_id(0) * BLOCK_SIZE;
- const uint y = get_global_id(1) * BLOCK_SIZE;
+ const uint x = get_global_id(0) * VEC_SIZE;
+ const uint y = get_global_id(1) * 2;
const uint z = channel_id * NUM_GROUPS + group_id;
- // Load the NxN block
- TYPE u0 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
- TYPE u1 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
- TYPE u2 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));
- TYPE u3 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 3, 0));
-#if BLOCK_SIZE > 4
- TYPE u4 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 4, 0));
- TYPE u5 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 5, 0));
- TYPE u6 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 6, 0));
- TYPE u7 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 7, 0));
-#if BLOCK_SIZE == 16
- TYPE u8 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 8, 0));
- TYPE u9 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 9, 0));
- TYPE u10 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 10, 0));
- TYPE u11 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 11, 0));
- TYPE u12 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 12, 0));
- TYPE u13 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 13, 0));
- TYPE u14 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 14, 0));
- TYPE u15 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 15, 0));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+ TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+ // Store blocks
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ VSTORE(VEC_SIZE)
+ (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+ VSTORE(VEC_SIZE)
+ (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
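Note: each work item of the rewritten NCHW kernel now moves a VEC_SIZE x 2 tile (VEC_SIZE columns, two rows) of one channel plane, instead of the old BLOCK_SIZE x BLOCK_SIZE block. A sketch of the byte-offset arithmetic it performs, with made-up strides:

    #include <cstddef>

    // Hedged sketch of the source-address computation in channel_shuffle_nchw.
    // Strides are in bytes, as in the kernel; all values below are assumed.
    std::size_t src_tile_offset(std::size_t x, std::size_t y, std::size_t channel, std::size_t batch,
                                std::size_t elem_size, std::size_t stride_y,
                                std::size_t stride_z, std::size_t stride_w)
    {
        return x * elem_size + y * stride_y + channel * stride_z + batch * stride_w;
    }

    // Example: for F32 (elem_size 4) with W=64 (stride_y 256), H=32 (stride_z 8192)
    // and C=16 (stride_w 131072), the tile at x=8, y=4, channel=3, batch=1 starts
    // at 8*4 + 4*256 + 3*8192 + 1*131072 = 156704 bytes.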
+#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note This implementation is only defined for VEC_SIZE = 4
+ * @note The last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64, in order to prevent out-of-bounds writes.
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
+ uint channel_id0 = 0;
+ uint channel_id1 = 0;
+ uint channel_id2 = 0;
+ uint channel_id3 = 0;
+ uint group_id0 = 0;
+ uint group_id1 = 0;
+ uint group_id2 = 0;
+ uint group_id3 = 0;
+ uint y = 0;
+ uint batch_id = 0;
+
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
+
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
+ DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
+ DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
+ DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
+
+ const uint x = get_global_id(1) * 2;
+ const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
+ const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
+ const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
+ const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
+
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
+ TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
// Store blocks
- VSTORE(BLOCK_SIZE)
- (u0, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 0, z));
- VSTORE(BLOCK_SIZE)
- (u1, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 1, z));
- VSTORE(BLOCK_SIZE)
- (u2, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 2, z));
- VSTORE(BLOCK_SIZE)
- (u3, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 3, z));
-#if BLOCK_SIZE > 4
- VSTORE(BLOCK_SIZE)
- (u4, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 4, z));
- VSTORE(BLOCK_SIZE)
- (u5, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 5, z));
- VSTORE(BLOCK_SIZE)
- (u6, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 6, z));
- VSTORE(BLOCK_SIZE)
- (u7, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 7, z));
-#if BLOCK_SIZE == 16
- VSTORE(BLOCK_SIZE)
- (u8, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 8, z));
- VSTORE(BLOCK_SIZE)
- (u9, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 9, z));
- VSTORE(BLOCK_SIZE)
- (u10, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 10, z));
- VSTORE(BLOCK_SIZE)
- (u11, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 11, z));
- VSTORE(BLOCK_SIZE)
- (u12, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 12, z));
- VSTORE(BLOCK_SIZE)
- (u13, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 13, z));
- VSTORE(BLOCK_SIZE)
- (u14, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 14, z));
- VSTORE(BLOCK_SIZE)
- (u15, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 15, z));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
}
-#endif /* defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K) */
+#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
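Note: the four channels a work item loads map to non-contiguous output channels (z0..z3 above), so the NHWC kernel must scatter each vector lane with a scalar store rather than a single VSTORE. A worked example of the scatter, assuming 8 channels split into NUM_GROUPS = 2 (so K = 4):

    #include <cstdio>

    // Hedged illustration of the per-lane destination channels; values assumed.
    int main()
    {
        const unsigned num_groups = 2, K = 4, curr_channel = 4;
        for(unsigned lane = 0; lane < 4; ++lane)
        {
            const unsigned c = curr_channel + lane;          // loaded channel
            const unsigned z = (c % K) * num_groups + c / K; // scattered channel
            std::printf("lane %u: channel %u -> %u\n", lane, c, z);
        }
        return 0; // prints 4->1, 5->3, 6->5, 7->7
    }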
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index be4d68770d..53a54564d6 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -67,18 +67,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const unsigned int num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
+ constexpr unsigned int num_elems_processed_per_iteration_y = 2;
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
const bool window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->valid_region());
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, win_collapsed);
}
} // namespace
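Note: collapsing the window at DimZ is what realises COMPMID-1584 on the host side: dimensions z and above are folded into one, so run() can enqueue a single 4D kernel instead of looping over per-batch 3D slices. A hedged model of the extent arithmetic only (not ACL's Window API):

    #include <cassert>

    int main()
    {
        const unsigned src_dim_z = 96;                  // channels (dim 2), assumed
        const unsigned batches   = 4;                   // dim 3, assumed
        const unsigned collapsed = src_dim_z * batches; // folded z extent
        const unsigned gid2      = 350;                 // a hypothetical work-item id
        assert(gid2 < collapsed);
        assert(gid2 / src_dim_z == 3 && gid2 % src_dim_z == 62); // batch 3, channel 62
        return 0;
    }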
@@ -96,14 +100,19 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const unsigned int vec_size = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
- build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
+
switch(input->info()->element_size())
{
case 1:
@@ -120,12 +129,33 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
}
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+ std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(num_groups);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
}
Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
@@ -141,14 +171,9 @@ void CLChannelShuffleLayerKernel::run(const Window &window, cl::CommandQueue &qu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_4D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window, lws_hint());
}
} // namespace arm_compute
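Note: LAST_ACCESSED = max(channels - vec_size, 0) pairs with the min() clamp at the top of channel_shuffle_nhwc: when channels is not a multiple of VEC_SIZE, the last 4-wide block is shifted back so it overlaps the previous one instead of reading or writing past the channel extent. A hedged model of that clamp (the real one runs in the OpenCL kernel):

    #include <algorithm>
    #include <cassert>

    int main()
    {
        const unsigned channels = 10, vec_size = 4; // assumed sizes
        const unsigned last     = std::max(static_cast<int>(channels) -
                                           static_cast<int>(vec_size), 0); // 6
        for(unsigned gid0 = 0; gid0 * vec_size < channels; ++gid0)
        {
            const unsigned curr_channel = std::min(gid0 * vec_size, last); // 0, 4, 6
            assert(curr_channel + vec_size <= channels);                   // never OOB
        }
        return 0;
    }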
diff --git a/tests/validation/CL/ChannelShuffle.cpp b/tests/validation/CL/ChannelShuffle.cpp
index 41813c4ffd..c2373ff895 100644
--- a/tests/validation/CL/ChannelShuffle.cpp
+++ b/tests/validation/CL/ChannelShuffle.cpp
@@ -63,15 +63,17 @@ template <typename T>
using CLChannelShuffleLayerFixture = ChannelShuffleLayerValidationFixture<CLTensor, CLAccessor, CLChannelShuffleLayer, T>;
TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomChannelShuffleLayerDataset(),
- framework::dataset::make("DataType",
- DataType::U8)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallRandomChannelShuffleLayerDataset(),
+ framework::dataset::make("DataType", DataType::U8)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
- DataType::U8)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeRandomChannelShuffleLayerDataset(),
+ framework::dataset::make("DataType",
+ DataType::U8)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -80,14 +82,18 @@ TEST_SUITE_END()
TEST_SUITE(Float)
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
- DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallRandomChannelShuffleLayerDataset(),
+ framework::dataset::make("DataType",
+ DataType::F16)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
- DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeRandomChannelShuffleLayerDataset(),
+ framework::dataset::make("DataType",
+ DataType::F16)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -95,14 +101,18 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<half>, framework::
TEST_SUITE_END()
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
- DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallRandomChannelShuffleLayerDataset(),
+ framework::dataset::make("DataType",
+ DataType::F32)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
- DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeRandomChannelShuffleLayerDataset(),
+ framework::dataset::make("DataType",
+ DataType::F32)),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
{
// Validate output
validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/fixtures/ChannelShuffleLayerFixture.h b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
index 9746480829..c9aae2dc17 100644
--- a/tests/validation/fixtures/ChannelShuffleLayerFixture.h
+++ b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
@@ -46,9 +46,9 @@ class ChannelShuffleLayerValidationFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(TensorShape shape, unsigned int num_groups, DataType data_type)
+ void setup(TensorShape shape, unsigned int num_groups, DataType data_type, DataLayout data_layout)
{
- _target = compute_target(shape, data_type, num_groups);
+ _target = compute_target(shape, data_type, num_groups, data_layout);
_reference = compute_reference(shape, data_type, num_groups);
}
@@ -59,11 +59,17 @@ protected:
library->fill_tensor_uniform(tensor, 0);
}
- TensorType compute_target(const TensorShape &shape, DataType data_type, unsigned int num_groups)
+ TensorType compute_target(TensorShape shape, DataType data_type, unsigned int num_groups, DataLayout data_layout)
{
+ // Note: The input shape passed to the function is always in NCHW
+ if(data_layout == DataLayout::NHWC)
+ {
+ permute(shape, PermutationVector(2U, 0U, 1U));
+ }
+
// Create tensors
- TensorType src = create_tensor<TensorType>(shape, data_type);
- TensorType dst = create_tensor<TensorType>(shape, data_type);
+ TensorType src = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), data_layout);
+ TensorType dst;
// Create and configure function
FunctionType channel_shuffle_func;
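Note on the fixture change: ACL tensor shapes are stored innermost-first, so an NCHW shape reads (W, H, C). Assuming permute() applies dst[i] = src[perm[i]], PermutationVector(2U, 0U, 1U) moves channels innermost and yields the (C, W, H) ordering expected for NHWC. An illustrative check with std::array standing in for TensorShape:

    #include <array>
    #include <cassert>

    int main()
    {
        const std::array<unsigned, 3> nchw{{32, 24, 16}}; // (W, H, C), assumed sizes
        const std::array<unsigned, 3> perm{{2, 0, 1}};    // PermutationVector(2U, 0U, 1U)
        std::array<unsigned, 3>       nhwc{};
        for(unsigned d = 0; d < 3; ++d)
        {
            nhwc[d] = nchw[perm[d]];
        }
        assert(nhwc[0] == 16 && nhwc[1] == 32 && nhwc[2] == 24); // (C, W, H)
        return 0;
    }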