diff options
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 6 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/slice_ops.cl (renamed from src/core/CL/cl_kernels/strided_slice.cl) | 46 | ||||
-rw-r--r-- | src/core/CL/kernels/CLStridedSliceKernel.cpp | 21 | ||||
-rw-r--r-- | src/core/utils/helpers/tensor_transform.cpp | 24 |
4 files changed, 79 insertions, 18 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 29fd672a96..0cc6e320bf 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -361,7 +361,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" }, { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" }, { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" }, - { "strided_slice", "strided_slice.cl" }, + { "strided_slice", "slice_ops.cl" }, { "suppress_non_maximum", "canny.cl" }, { "tablelookup_U8", "tablelookup.cl" }, { "tablelookup_S16", "tablelookup.cl" }, @@ -742,8 +742,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = #include "./cl_kernels/softmax_layer_quantized.clembed" }, { - "strided_slice.cl", -#include "./cl_kernels/strided_slice.clembed" + "slice_ops.cl", +#include "./cl_kernels/slice_ops.clembed" }, { "tablelookup.cl", diff --git a/src/core/CL/cl_kernels/strided_slice.cl b/src/core/CL/cl_kernels/slice_ops.cl index 7c68fb9a07..bc3df47345 100644 --- a/src/core/CL/cl_kernels/strided_slice.cl +++ b/src/core/CL/cl_kernels/slice_ops.cl @@ -61,25 +61,47 @@ __kernel void strided_slice( Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH); Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); - int offset_0 = 0; - int offset_1 = 0; - int offset_2 = 0; - int offset_3 = 0; + int offset = 0; - // Calculate offset -#if defined(START_0) && defined(STRIDE_0) - offset_0 = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0; + // Offset X +#if defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + offset = (int)START_0 + min(xi, (int)LAST_ACCESSED_X); + input.ptr += offset * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; +#elif defined(START_0) && defined(STRIDE_0) + offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0; + input.ptr += offset * input_stride_x; #endif // defined(START_0) && defined(STRIDE_0) + + // Offset Y #if defined(START_1) && defined(STRIDE_1) - offset_1 = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1; + offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1; + input.ptr += offset * input_stride_y; #endif // defined(START_1) && defined(STRIDE_1) + + // Offset Z #if defined(START_2) && defined(STRIDE_2) - offset_2 = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2; + offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2; + input.ptr += offset * input_stride_z; #endif // defined(START_2) && defined(STRIDE_2) + + // Offset depth #if defined(START_3) && defined(STRIDE_3) - offset_3 = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3; -#endif // defined(START_2) && defined(STRIDE_2) + offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3; + input.ptr += offset * input_stride_w; +#endif // defined(START_3) && defined(STRIDE_3) // Store result - *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)tensor4D_offset(&input, offset_0, offset_1, offset_2, offset_3)); +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr)); + + VSTORE(VEC_SIZE) + (val, 0, (__global DATA_TYPE *)(output.ptr)); +#else // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr)); +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index f07436ac60..2d2ba103e5 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -55,10 +55,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - for(unsigned int i = 0; i < strides.num_dimensions(); ++i) + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { - ARM_COMPUTE_RETURN_ERROR_ON(strides[i] == 0); - } + return i == 0; + })); // Get expected output shape const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, @@ -120,6 +120,19 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + // Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size + const int vec_size_x = 16 / input->info()->element_size(); + const int output_width_x = output->info()->tensor_shape().x(); + const bool multi_access_x = (final_strides.x() == 1) && (output_width_x / vec_size_x > 0); + + // Update window if needed + if(multi_access_x) + { + Window &updated_window = std::get<1>(win_config); + updated_window.set(Window::DimX, + Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); + } ICLKernel::configure_internal(win_config.second); // Create build options @@ -130,6 +143,8 @@ void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i])); build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i])); } + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option_if_else(input_shape.num_dimensions() > 2, "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1"); diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp index 5c83a8bdb5..a4bce5da5a 100644 --- a/src/core/utils/helpers/tensor_transform.cpp +++ b/src/core/utils/helpers/tensor_transform.cpp @@ -29,6 +29,30 @@ namespace helpers { namespace tensor_transform { +Coordinates slice_absolute_end_coords(TensorShape input_shape, Coordinates ends) +{ + // Create end mask + int32_t end_mask = 0; + for(unsigned int i = 0; i < ends.num_dimensions(); ++i) + { + if(ends[i] < 0) + { + end_mask |= 1 << i; + } + } + // Get unit strides + const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides()); + + return strided_slice_absolute_end_coords(input_shape, Coordinates(), ends, unit_strides, end_mask); +} + +TensorShape compute_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends_abs) +{ + // Get unit strides + const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides()); + return compute_strided_slice_output_shape(input_shape, starts, ends_abs, unit_strides); +} + Coordinates strided_slice_absolute_start_coords(TensorShape input_shape, Coordinates starts, Coordinates strides, int32_t begin_mask) { Coordinates starts_abs; |