From bf17955e2bf36c635acbac7c3bb03fbbd7732671 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 5 Sep 2017 13:51:21 +0100 Subject: COMPMID-522 - Added support for GlobalPooling in CLPoolingLayer and CLFlattening for 3D tensor Change-Id: Ifc7db1e4d4af322a4dcbfeb3e132e5c326596872 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86618 Reviewed-by: Georgios Pinitas Tested-by: Kaizen --- src/core/CL/CLKernelLibrary.cpp | 1 + src/core/CL/cl_kernels/pooling_layer.cl | 98 ++++++++++++++++++++++++++++ src/core/CL/kernels/CLPoolingLayerKernel.cpp | 80 ++++++++++++++--------- 3 files changed, 149 insertions(+), 30 deletions(-) (limited to 'src/core') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 6602b4dc78..4cd0a78a92 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -239,6 +239,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "pooling_layer_3", "pooling_layer.cl" }, { "pooling_layer_3_optimized", "pooling_layer.cl" }, { "pooling_layer_7", "pooling_layer.cl" }, + { "pooling_layer_N", "pooling_layer.cl" }, { "quantization_layer", "quantization_layer.cl" }, { "reduction_operation", "reduction_operation.cl" }, { "remap_nearest_neighbour", "remap.cl" }, diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index 18ad4a69a8..0497bf4b91 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -415,3 +415,101 @@ __kernel void pooling_layer_7( // Store result *(__global DATA_TYPE *)output.ptr = res; } + +#if defined(POOL_SIZE) + +// Set the initial value for the pooling operation accordingly with the data type +#if defined(POOL_AVG) +#define INITIAL_VALUE 0 +#else // POOL_AVG +#ifdef FIXED_POINT_POSITION +#define MIN_VAL_EXPAND(type) type##_MIN +#define MIN_VAL(type) MIN_VAL_EXPAND(type) +#define INITIAL_VALUE MIN_VAL(DATA_TYPE) +#define INITIAL_VALUE 0 +#else // FIXED_POINT_POSITION +#if FP16 +#define INITIAL_VALUE -HALF_MAX +#else // FP16 +#define INITIAL_VALUE -FLT_MAX +#endif // FP16 +#endif // FIXED_POINT_POSITION + +#endif // POOL_AVG + +/** Performs a pooling function of pool size equal to N + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note -DFP16 must be passed at compile time if half float data type is used + * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pooling_layer_N( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, 8) + vdata = INITIAL_VALUE; + DATA_TYPE sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE - 8); x += 8) + { + VEC_DATA_TYPE(DATA_TYPE, 8) + data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE; ++x) + { + DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + VEC_DATA_TYPE(DATA_TYPE, 4) + reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + VEC_DATA_TYPE(DATA_TYPE, 2) + reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + + // Divide by pool region in case of average pooling +#ifdef POOL_AVG + res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); +#endif /* POOL_AVG */ + + // Store result + *(__global DATA_TYPE *)output.ptr = res; +} +#endif // defined(POOL_SIZE) \ No newline at end of file diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 22d29c66aa..22c7730963 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -64,13 +64,10 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - static const std::set supported_pool_sizes = { 2, 3, 7 }; - ARM_COMPUTE_UNUSED(supported_pool_sizes); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()); ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); + ARM_COMPUTE_ERROR_ON(pool_size > 7 && is_data_type_fixed_point(input->info()->data_type())); // Check output dimensions std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), @@ -92,29 +89,14 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); - // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where - // each thread computes 4 output elements - const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type()); - - int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size; - if(is_pool3x3_stride_le3) - { - // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3 - _num_elems_processed_per_iteration = 4; - num_elements_read_per_iteration = pool_size * (pool_stride_x + 1); - } - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); // Set instance variables - _input = input; - _output = output; - _pool_info = pool_info; - _border_size = BorderSize(pool_pad_y, pool_pad_x); - _border_size.right = std::max(upper_bound_w, pool_pad_x); - _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + _input = input; + _output = output; + _pool_info = pool_info; + _border_size = BorderSize(pool_pad_y, pool_pad_x); // Set build options std::set build_opts; @@ -136,14 +118,52 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, } // Create kernel - std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size); - if(is_pool3x3_stride_le3) + if(pool_size <= 7) { - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts)); + // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where + // each thread computes 4 output elements + const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type()); + + int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size; + if(is_pool3x3_stride_le3) + { + // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3 + _num_elems_processed_per_iteration = 4; + num_elements_read_per_iteration = pool_size * (pool_stride_x + 1); + } + + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size); + if(is_pool3x3_stride_le3) + { + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts)); + } + else + { + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + } } - else + else // Run general case { - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + _num_elems_processed_per_iteration = 1; + + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size))); + if(input->info()->data_type() == DataType::F16) + { + build_opts.emplace("-DFP16"); + } + _kernel = static_cast(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts)); } // Configure kernel window -- cgit v1.2.1