aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
diff options
context:
space:
mode:
author Gian Marco Iodice <gianmarco.iodice@arm.com> 2018-09-13 11:51:56 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:54:54 +0000
commit 8bab0ee5f44a0e2cfe60d4d9e040a2f5ae4ef9b1 (patch)
tree 7659331b8f28ef85d0f59a8cca22384b25006e58 /src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
parent 651540f5fc0529589867fc834f8c206c7e7a60c1 (diff)
download ComputeLibrary-8bab0ee5f44a0e2cfe60d4d9e040a2f5ae4ef9b1.tar.gz
COMPMID-1584 - Collapse batch size in CLChannelShuffleLayerKernel
COMPMID-1589 - Add support for NHWC to CLChannelShuffleLayerKernel

Change-Id: I13936a5cd1659d01fdb10b346e90f0d72d79f1f1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148475
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp')
-rw-r--r-- src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp 61
1 file changed, 43 insertions, 18 deletions
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index be4d68770d..53a54564d6 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -67,18 +67,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const unsigned int num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
+ constexpr unsigned int num_elems_processed_per_iteration_y = 2;
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
const bool window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->valid_region());
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, win_collapsed);
}
} // namespace
@@ -96,14 +100,19 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const unsigned int vec_size = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
- build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
+
switch(input->info()->element_size())
{
case 1:
@@ -120,12 +129,33 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
}
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+ std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout));
+ ;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(num_groups);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
}
Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
@@ -141,14 +171,9 @@ void CLChannelShuffleLayerKernel::run(const Window &window, cl::CommandQueue &qu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_4D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window, lws_hint());
}
} // namespace arm_compute