diff options
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r-- | src/core/CL/kernels/CLActivationLayerKernel.cpp | 4 | ||||
-rw-r--r-- | src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 11 | ||||
-rw-r--r-- | src/core/CL/kernels/CLPoolingLayerKernel.cpp | 100 |
3 files changed, 64 insertions, 51 deletions
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp index 5bfc832518..adedebba53 100644 --- a/src/core/CL/kernels/CLActivationLayerKernel.cpp +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -101,7 +101,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); // Set scale and offset of the input and output - if(is_data_type_quantized_assymetric(dt)) + if(is_data_type_quantized_asymmetric(dt)) { float s1 = input->info()->quantization_info().scale; int o1 = input->info()->quantization_info().offset; @@ -127,7 +127,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act } // Create kernel - std::string kernel_name = is_data_type_quantized_assymetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer"); + std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer"); _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); // Make sure _kernel is initialized before calling the parent's configure diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index 53e46390c1..5f109f76af 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -84,7 +84,12 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL output_shape.set(2, weights->info()->dimension(3)); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), + output_shape, + 1, + input->info()->data_type(), + input->info()->fixed_point_position(), + input->info()->quantization_info()); ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -176,7 +181,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL else { bool is_quantized_fixed_point = is_data_type_fixed_point(data_type); - bool is_quantized_asymm = is_data_type_quantized_assymetric(data_type); + bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type); DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type; build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size))); @@ -220,7 +225,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL } // Set static kernel arguments - if(is_data_type_quantized_assymetric(data_type)) + if(is_data_type_quantized_asymmetric(data_type)) { int output_multiplier = 0; int output_shift = 0; diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 2854cd8265..1317278fb5 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" @@ -80,7 +81,12 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, output_shape.set(0, pooled_w); output_shape.set(1, pooled_h); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), + output_shape, + 1, + input->info()->data_type(), + input->info()->fixed_point_position(), + input->info()->quantization_info()); } ARM_COMPUTE_ERROR_THROW_ON(CLPoolingLayerKernel::validate(input->info(), output->info(), pool_info)); @@ -94,80 +100,80 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, _pool_info = pool_info; _border_size = BorderSize(pool_pad_y, pool_pad_x); - // Set build options - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type))); - if(is_data_type_fixed_point(input->info()->data_type())) - { - build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())); - } + const GPUTarget gpu_target = get_arch_from_target(get_target()); + const DataType data_type = input->info()->data_type(); - build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x))); + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); + build_opts.add_option_if(is_data_type_fixed_point(data_type), + "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())); + build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); if(pool_type != PoolingType::MAX) { - if(exclude_padding) - { - build_opts.emplace("-DEXCLUDE_PADDING"); - } - build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x)))); - build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y)))); - build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y))); - build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x))); - build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y))); + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x))); + build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y))); + build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); + build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)); + build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)); } // Create kernel - if((pool_size == 2) || (pool_size == 3) || (pool_size == 7)) + if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type)) { // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where // each thread computes 4 output elements - const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type()); + const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type); - int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size; + int num_elems_read_per_iteration = pool_size; if(is_pool3x3_stride_le3) { - // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3 + // Change the number of elements processed and the number of elements read per iteration + // for pooling 3x3 with stride less equal than 3 _num_elems_processed_per_iteration = 4; - num_elements_read_per_iteration = pool_size * (pool_stride_x + 1); + num_elems_read_per_iteration = pool_size * (pool_stride_x + 1); } - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width; + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; _border_size.right = std::max(upper_bound_w, pool_pad_x); _border_size.bottom = std::max(upper_bound_h, pool_pad_y); - std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size); - if(is_pool3x3_stride_le3) - { - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts)); - } - else - { - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); - } + std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") + + support::cpp11::to_string(pool_size); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } else // Run general case { - _num_elems_processed_per_iteration = 1; - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; _border_size.right = std::max(upper_bound_w, pool_pad_x); _border_size.bottom = std::max(upper_bound_h, pool_pad_y); - build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size))); - if(input->info()->data_type() == DataType::F16) - { - build_opts.emplace("-DFP16"); - } - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts)); + build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)); + build_opts.add_option_if(data_type == DataType::F16, "-DFP16"); + + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration)); + Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration)); + + // Configure the local work size (hint) from the first two dimensions of the global work size. + // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized + // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is + // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with). + if(gpu_target == GPUTarget::BIFROST) + { + cl::NDRange gws = ICLKernel::gws_from_window(win); + _lws_hint = cl::NDRange(gws[0], gws[1], 1); + } + AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration); update_window_and_padding(win, input_access, output_access); @@ -178,14 +184,16 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, Error CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2), + "Unsupported combination of parameters!"); int pool_pad_x = 0; int pool_pad_y = 0; int pool_size = pool_info.pool_size(); std::tie(pool_pad_x, pool_pad_y) = pool_info.pad_stride_info().pad(); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_pad_x >= pool_size) || (pool_pad_y >= pool_size)), - "Invalid pool size and pool pad combination"); + "Invalid pool size and pool pad combination!"); // Checks performed when output is configured if(output->total_size() != 0) @@ -230,7 +238,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); + enqueue(queue, *this, slice, _lws_hint); } while(window_collapsed.slide_window_slice_3D(slice)); } |