diff options
Diffstat (limited to 'src/core/CL')
-rw-r--r-- | src/core/CL/cl_kernels/col2im.cl | 24 | ||||
-rw-r--r-- | src/core/CL/kernels/CLCol2ImKernel.cpp | 35 |
2 files changed, 37 insertions, 22 deletions
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl index 98bf8d1ed4..5e52127f27 100644 --- a/src/core/CL/cl_kernels/col2im.cl +++ b/src/core/CL/cl_kernels/col2im.cl @@ -41,12 +41,15 @@ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320 * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600 * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4 + * @note In case of grouping the GROUPING flag must be passed at compile time using -DGROUPING * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) @@ -59,11 +62,14 @@ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) */ __kernel void col2im( - IMAGE_DECLARATION(src), + TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), uint dst_stride_w) { - Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor + const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor VEC_DATA_TYPE(DATA_TYPE, 8) data = vload8(0, (__global DATA_TYPE *)src.ptr); @@ -82,8 +88,16 @@ __kernel void col2im( __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes; - // Compute output offset - int idx = (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w; +#if defined(GROUPING) + // Compute output offset (batches on 4th dimension, no need to compute manually) + int idx = yd * dst_stride_y + xd * dst_stride_x; + + const uint group = get_global_id(2); // group ID + x_clamped += group * WIDTH_INPUT; +#else /* defined(GROUPING) */ + // Compute output offset (batches on 3rd dimension) + int idx = yd * dst_stride_y + xd * dst_stride_x + get_global_id(2) * dst_stride_w; +#endif /* GROUPING */ // Store value *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0; @@ -95,4 +109,4 @@ __kernel void col2im( *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6; *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7; } -#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
\ No newline at end of file +#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp index 6fd3be7f6a..d7582dc943 100644 --- a/src/core/CL/kernels/CLCol2ImKernel.cpp +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -40,7 +40,7 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); @@ -49,19 +49,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, s // Checks performed when output is configured if(output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims))); + auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, num_groups)).set_data_layout(DataLayout::NCHW)); const unsigned int num_elems_read_per_iteration = 8; @@ -86,12 +87,12 @@ CLCol2ImKernel::CLCol2ImKernel() { } -void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims) +void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims, num_groups)); _input = input; _output = output; @@ -105,11 +106,12 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first)); + build_opts.add_option_if(num_groups > 1, "-DGROUPING"); _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options())); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims); + auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims, num_groups); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -117,6 +119,7 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p _config_id = "col2im_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); _config_id += "_"; + _config_id += (num_groups > 1) ? "grouping_" : ""; _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(1)); @@ -126,11 +129,11 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p _config_id += support::cpp11::to_string(output->info()->dimension(1)); } -Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims) +Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims, num_groups).first); return Status{}; } @@ -142,21 +145,19 @@ void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue) Window out_window; out_window.use_tensor_dimensions(_output->info()->tensor_shape()); - Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed_window.first_slice_window_2D(); - Window slice_out = out_window.first_slice_window_3D(); + Window slice = window.first_slice_window_3D(); + Window slice_out = out_window.first_slice_window_3D(); - // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); + unsigned int idx = 2 * num_arguments_per_3D_tensor(); _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes()[3]); do { // Set inputs unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice, lws_hint()); } - while(collapsed_window.slide_window_slice_2D(slice) && out_window.slide_window_slice_3D(slice_out)); + while(window.slide_window_slice_3D(slice) && out_window.slide_window_slice_3D(slice_out)); } |