From b4bb6a03f717a320b935809fde795b3d6ec5a69f Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 24 May 2021 16:01:32 +0100
Subject: Rename ported functions

Rename CpuPooling to CpuPool2d
Rename CpuPoolingKernel to CpuPool2dKernel
Rename CpuPoolingAssemblyWrapperKernel to CpuPool2dAssemblyWrapperKernel
Move CpuPool2dAssemblyWrapperKernel in internal subfolder
Rename CpuDepthwiseConvolutionNativeKernel to CpuDepthwiseConv2dNativeKernel
Rename CpuDepthwiseConvolutionAssemblyDispatch to CpuDepthwiseConv2dAssemblyDispatch
Rename CpuDepthwiseConvolution to CpuDepthwiseConv2d
Rename CpuDirectConvolutionKernel to CpuDirectConv2dKernel
Rename CpuDirectConvolutionOutputStageKernel to CpuDirectConv2dOutputStageKernel
Rename CpuDirectConvolution to CpuDirectConv2d
Rename ClPoolingKernel to ClPool2dKernel
Rename ClPooling to ClPool2d
Rename ClDirectConvolutionKernel to ClDirectConv2dKernel

Resolves: COMPMID-4405

Change-Id: I8e48f015e4e492a76a7512f5679cb3eb0cd028f6
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5708
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 665 +++++++++++++++++++++
 src/core/gpu/cl/kernels/ClDirectConv2dKernel.h   |  87 +++
 .../gpu/cl/kernels/ClDirectConvolutionKernel.cpp | 665 ---------------------
 .../gpu/cl/kernels/ClDirectConvolutionKernel.h   |  97 ---
 src/core/gpu/cl/kernels/ClPool2dKernel.cpp       | 509 ++++++++++++++++
 src/core/gpu/cl/kernels/ClPool2dKernel.h         |  76 +++
 src/core/gpu/cl/kernels/ClPoolingKernel.cpp      | 509 ----------------
 src/core/gpu/cl/kernels/ClPoolingKernel.h        |  79 ---
 8 files changed, 1337 insertions(+), 1350 deletions(-)
 create mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
 delete mode 100644 src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
 delete mode 100644 src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h
 create mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.cpp
 create mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.h
 delete mode 100644 src/core/gpu/cl/kernels/ClPoolingKernel.cpp
 delete mode 100644 src/core/gpu/cl/kernels/ClPoolingKernel.h

(limited to 'src/core/gpu/cl')

diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
new file mode 100644
index 0000000000..2c9a4f301b
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) + && std::get<0>(conv_info.stride()) > 2, + "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); + + if(data_layout == DataLayout::NCHW) + { + if(is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + } + } + + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(src->data_type())) + { + 
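+            // For asymmetric-quantized sources the bias is expected as an S32 accumulator:
+            // a float bias b is typically folded into the integer pipeline as
+            // round(b / (src_scale * weights_scale)), hence the S32-only check below.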
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), + "Biases size and number of src feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, + "Biases should be one dimensional"); + } + + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + const auto data_type = src->data_type(); + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + } + return Status{}; +} + +inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, + DataType data_type, DataLayout data_layout) +{ + return gpu_target_is_in(gpu_target, + GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, + GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, + GPUTarget::G52, GPUTarget::G52LIT) + && (kernel_size <= 5) + && (conv_stride_x == 1) && (conv_stride_y == 1) + && (data_type == DataType::F32) + && (data_layout == DataLayout::NCHW); +} + +inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, + unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, + unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) +{ + const DataType data_type = src->data_type(); + const DataLayout data_layout = src->data_layout(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); + + if(run_optimized_bifrost) + { + // Configure kernel window + switch(kernel_size) + { + case 1: + { + num_elems_read_per_iteration_x = 4; + num_elems_read_per_iteration_y = 4; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 4; + break; + } + case 3: + { + num_elems_read_per_iteration_x = 6; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 3; + break; + } + case 5: + { + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 6; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 2; + break; + } + default: + { + ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); + } + } + } + else + { + num_elems_read_per_iteration_y = kernel_size; + num_elems_written_per_iteration_x = 8; + num_elems_written_per_iteration_y = 1; + switch(kernel_size) + { + case 1: + switch(conv_stride_x) + { + case 1: + 
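+                        // 1x1 kernel, stride 1: each iteration writes 8 outputs, so 8 input elements are read per row.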
+                        num_elems_read_per_iteration_x = 8;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 16;
+                        break;
+                    case 3:
+                        switch(src->element_size())
+                        {
+                            case 1:
+                                num_elems_read_per_iteration_x = 28;
+                                break;
+                            case 2:
+                                num_elems_read_per_iteration_x = 24;
+                                break;
+                            case 4:
+                                num_elems_read_per_iteration_x = 22;
+                                break;
+                            default:
+                                ARM_COMPUTE_ERROR("Invalid data size");
+                        }
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 3:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 10;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 17;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 5:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 12;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 20;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 9:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 16;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 24;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Invalid direct convolution size");
+        }
+    }
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target)
+{
+    const DataLayout data_layout = src->data_layout();
+
+    // Get dst shape
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto-initialization if not yet initialized
+    auto_init_if_empty(*dst, output_shape,
+                       1,
+                       src->data_type(),
+                       src->quantization_info());
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
+        unsigned int       num_rows = 1U;
+        if(dst->tensor_shape()[0] > 16)
+        {
+            num_rows = src->data_type() == DataType::F32 ?
2U : 4U; + } + + // Create window and update padding + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + return std::make_pair(Status{}, win); + } + else if(data_layout == DataLayout::NCHW) + { + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int kernel_size = weights->dimension(width_idx); + + unsigned int num_elems_read_per_iteration_x = 0; + unsigned int num_elems_read_per_iteration_y = 0; + unsigned int num_elems_written_per_iteration_x = 0; + unsigned int num_elems_written_per_iteration_y = 0; + + unsigned int conv_pad_left = conv_info.pad_left(); + unsigned int conv_pad_top = conv_info.pad_top(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, + num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, + kernel_size, conv_info, target, src); + + // Create window and update padding + bool window_changed = false; + Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); + + AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); + AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); + AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); + window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + Status err = (window_changed) ? 
+                     ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+        return std::make_pair(err, win);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported");
+    }
+}
+
+bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
+{
+    if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+    {
+        return false;
+    }
+
+    // If not floating point
+    if(!is_data_type_float(tensor->data_type()))
+    {
+        return false;
+    }
+
+    if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+    {
+        return false;
+    }
+
+    // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+    if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        return false;
+    }
+
+    // Check cl image pitch alignment
+    if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
+    {
+        return false;
+    }
+
+    const size_t image_w     = tensor->tensor_shape()[0] / 4;
+    const size_t image_h     = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+    const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+    const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+    if(image_w > max_image_w || image_h > max_image_h)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+} // namespace
+
+BorderSize ClDirectConv2dKernel::border_size() const
+{
+    return _border_size;
+}
+
+void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                                     const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+    // Perform validation
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
+                                                  weights,
+                                                  (biases != nullptr) ? biases : nullptr,
+                                                  dst,
+                                                  conv_info));
+
+    const int conv_stride_x = std::get<0>(conv_info.stride());
+    const int conv_stride_y = std::get<1>(conv_info.stride());
+
+    _data_layout = src->data_layout();
+    _conv_info   = conv_info;
+
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int kernel_size = weights->dimension(width_idx);
+    const DataType     data_type   = src->data_type();
+
+    const GPUTarget gpu_target = get_target();
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    std::stringstream kernel_name;
+    CLBuildOptions    build_options;
+
+    if(_data_layout == DataLayout::NHWC)
+    {
+        _border_size = BorderSize();
+
+        kernel_name << "direct_convolution_nhwc";
+
+        const unsigned int n0 = win_config.second.x().step();
+        const unsigned int m0 = win_config.second.y().step();
+        const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ?
16u : 8u, src->dimension(channel_idx)); + const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; + const unsigned int pad_left = conv_info.pad_left(); + const unsigned int pad_top = conv_info.pad_top(); + const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); + + // Update the padding for the weights tensor if we can export to cl_image + if(export_to_cl_image) + { + gemm::update_padding_for_cl_image(weights); + } + + if(biases != nullptr) + { + build_options.add_option(std::string("-DHAS_BIAS")); + build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); + } + + build_options.add_option("-cl-fast-relaxed-math"); + build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); + build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); + build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); + build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); + build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); + build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); + build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); + build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); + build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); + build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); + build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); + build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + int zero_value_s32; + zero_value.get(zero_value_s32); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DIS_QUANTIZED"); + build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); 
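+            // Note: calculate_quantized_multiplier() follows the usual gemmlowp-style scheme and
+            // decomposes the float rescale factor M = src_scale * wei_scale / dst_scale into an
+            // integer multiplier m and a right shift s, with M ~= (m / 2^31) * 2^(-s), so the
+            // kernel can requantize S32 accumulators using only an integer multiply-high and a
+            // shift (e.g. M = 0.25 -> m = 2^30, s = 1).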
+ build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); + } + else + { + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); + } + } + else + { + _border_size = BorderSize(src->padding()); + + kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; + + build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); + + const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); + + if(run_optimized_for_bifrost) + { + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + + kernel_name << "_f32_bifrost"; + } + else + { + build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); + build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); + build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); + build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + + kernel_name.str("direct_convolution_quantized"); + } + } + } + + _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name.str(); + _config_id += "_"; + _config_id += lower_string(string_from_data_type(data_type)); + _config_id += "_"; + _config_id += support::cpp11::to_string(kernel_size); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().left); + _config_id += "_"; + _config_id += 
+    support::cpp11::to_string(border_size().top);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(border_size().right);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(border_size().bottom);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_x);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_y);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(width_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(height_idx));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(_data_layout));
+}
+
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                                      const GPUTarget target)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first);
+
+    return Status{};
+}
+
+void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+
+    const auto src     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto       dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if(_data_layout == DataLayout::NHWC)
+    {
+        cl::Image2D weights_cl_image;
+
+        const size_t dim_y_collapsed    = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step());
+        const bool   export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout);
+
+        slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step()));
+        slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1));
+
+        if(export_to_cl_image)
+        {
+            const size_t      image_w = weights->info()->dimension(0) / 4;
+            const size_t      image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = weights->info()->strides_in_bytes()[1];
+
+            // Export cl_buffer to cl_image
+            weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch);
+        }
+
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice);
+        if(export_to_cl_image)
+        {
+            _kernel.setArg(idx++, weights_cl_image);
+        }
+        add_4D_tensor_argument(idx, weights, slice);
+        if(biases != nullptr)
+        {
+            add_1D_tensor_argument(idx, biases, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    else
+    {
+        Window win_in = window;
+
+        win_in.adjust(Window::DimX, -_conv_info.pad_left(), true);
+        win_in.adjust(Window::DimY, -_conv_info.pad_top(), true);
+
+        const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+        const int height_idx = get_data_layout_dimension_index(_data_layout,
+                                                               DataLayoutDimension::HEIGHT);
+
+        const int conv_stride_x = std::get<0>(_conv_info.stride());
+        const int conv_stride_y = std::get<1>(_conv_info.stride());
+
+        win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x);
+        win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y);
+
+        Window       slice_in = win_in.first_slice_window_3D();
+        unsigned int idx1     = 2 * num_arguments_per_3D_tensor();
+        add_3D_tensor_argument(idx1, weights, slice);
+
+        if(biases != nullptr)
+        {
+            Window slice_biases;
+            slice_biases.use_tensor_dimensions(biases->info()->tensor_shape());
+            add_1D_tensor_argument(idx1, biases, slice_biases);
+        }
+
+        _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
+
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice_in);
+            add_3D_tensor_argument(idx, dst, slice);
+            enqueue(queue, *this, slice, lws_hint());
+        }
+        while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
new file mode 100644
index 0000000000..ec76624e5c
--- /dev/null
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/gpu/cl/ClCompileContext.h"
+#include "src/core/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the direct convolution kernel.
+ */
+class ClDirectConv2dKernel : public IClKernel
+{
+public:
+    ClDirectConv2dKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
+    /** Set the src, weights, biases and dst tensors info.
+     *
+     * @note: Due to set_valid_region(), src/weights/biases cannot be const. This needs to change once set_valid_region() is removed.
+     *
+     * @note: DirectConvolution only works in the following configurations:
+     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
+     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
+     *                            while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in] weights         Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                            The 3rd dimension must be the same as the src's volume 3rd dimension.
+     *                            Data type supported: Same as @p src.
+     * @param[in] biases          Biases tensor info. Biases are 1D tensor with dimension [OFM].
+     *                            Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type, where biases should be of S32 type.
+     * @param[out] dst            Output tensor info.
+     *                            The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+     * @param[in] conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv2dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+public:
+    DataLayout    _data_layout{};
+    BorderSize    _border_size{};
+    PadStrideInfo _conv_info{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
deleted file mode 100644
index 0a5101f564..0000000000
--- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
+++ /dev/null
@@ -1,665 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), - "Weights feature map dimension should match the respective src's one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) - && std::get<0>(conv_info.stride()) > 2, - "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); - - if(data_layout == DataLayout::NCHW) - { - if(is_data_type_quantized(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); - } - } - - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(src->data_type())) - { - 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), - "Biases size and number of src feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - } - return Status{}; -} - -inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) -{ - return gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && (kernel_size <= 5) - && (conv_stride_x == 1) && (conv_stride_y == 1) - && (data_type == DataType::F32) - && (data_layout == DataLayout::NCHW); -} - -inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, - unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, - unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) -{ - const DataType data_type = src->data_type(); - const DataLayout data_layout = src->data_layout(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); - - if(run_optimized_bifrost) - { - // Configure kernel window - switch(kernel_size) - { - case 1: - { - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 4; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 4; - break; - } - case 3: - { - num_elems_read_per_iteration_x = 6; - num_elems_read_per_iteration_y = 5; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 3; - break; - } - case 5: - { - num_elems_read_per_iteration_x = 8; - num_elems_read_per_iteration_y = 6; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 2; - break; - } - default: - { - ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); - } - } - } - else - { - num_elems_read_per_iteration_y = kernel_size; - num_elems_written_per_iteration_x = 8; - num_elems_written_per_iteration_y = 1; - switch(kernel_size) - { - case 1: - switch(conv_stride_x) - { - case 1: - 
num_elems_read_per_iteration_x = 8; - break; - case 2: - num_elems_read_per_iteration_x = 16; - break; - case 3: - switch(src->element_size()) - { - case 1: - num_elems_read_per_iteration_x = 28; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - case 4: - num_elems_read_per_iteration_x = 22; - break; - default: - ARM_COMPUTE_ERROR("Invalid data size"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 3: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 10; - break; - case 2: - num_elems_read_per_iteration_x = 17; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 5: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 12; - break; - case 2: - num_elems_read_per_iteration_x = 20; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 9: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 16; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid direct convolution size"); - } - } -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target) -{ - const DataLayout data_layout = src->data_layout(); - - // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); - - if(data_layout == DataLayout::NHWC) - { - const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); - unsigned int num_rows = 1U; - if(dst->tensor_shape()[0] > 16) - { - num_rows = src->data_type() == DataType::F32 ? 
2U : 4U; - } - - // Create window and update padding - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); - return std::make_pair(Status{}, win); - } - else if(data_layout == DataLayout::NCHW) - { - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int kernel_size = weights->dimension(width_idx); - - unsigned int num_elems_read_per_iteration_x = 0; - unsigned int num_elems_read_per_iteration_y = 0; - unsigned int num_elems_written_per_iteration_x = 0; - unsigned int num_elems_written_per_iteration_y = 0; - - unsigned int conv_pad_left = conv_info.pad_left(); - unsigned int conv_pad_top = conv_info.pad_top(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, - num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, - kernel_size, conv_info, target, src); - - // Create window and update padding - bool window_changed = false; - Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); - AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - -bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - -} // namespace - -BorderSize ClDirectConvolutionKernel::border_size() const -{ - return _border_size; -} - -void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - weights, - (biases != nullptr) ? biases : nullptr, - dst, - conv_info)); - - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - - _data_layout = src->data_layout(); - _conv_info = conv_info; - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const unsigned int kernel_size = weights->dimension(width_idx); - const DataType data_type = src->data_type(); - - const GPUTarget gpu_target = get_target(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - std::stringstream kernel_name; - CLBuildOptions build_options; - - if(_data_layout == DataLayout::NHWC) - { - _border_size = BorderSize(); - - kernel_name << "direct_convolution_nhwc"; - - const unsigned int n0 = win_config.second.x().step(); - const unsigned int m0 = win_config.second.y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; - const unsigned int pad_left = conv_info.pad_left(); - const unsigned int pad_top = conv_info.pad_top(); - const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); - - // Update the padding for the weights tensor if we can export to cl_image - if(export_to_cl_image) - { - gemm::update_padding_for_cl_image(weights); - } - - if(biases != nullptr) - { - build_options.add_option(std::string("-DHAS_BIAS")); - build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); - } - - build_options.add_option("-cl-fast-relaxed-math"); - build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); - build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); - build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); - build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); - build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); - build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); - build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); - build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); - build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); - build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); - build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); - build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); - build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); - build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); - build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - int zero_value_s32; - zero_value.get(zero_value_s32); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DIS_QUANTIZED"); - build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); 
- build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - } - else - { - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); - build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - } - } - else - { - _border_size = BorderSize(src->padding()); - - kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - - build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); - - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); - - if(run_optimized_for_bifrost) - { - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - - kernel_name << "_f32_bifrost"; - } - else - { - build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); - build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); - build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - - kernel_name.str("direct_convolution_quantized"); - } - } - } - - _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += support::cpp11::to_string(kernel_size); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().left); - _config_id += "_"; - _config_id += 
support::cpp11::to_string(border_size().top); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().right); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().bottom); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_x); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_y); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(width_idx)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(height_idx)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); -} - -Status ClDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const GPUTarget target) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); - - return Status{}; -} - -void ClDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Get initial windows - Window slice = window.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(_data_layout == DataLayout::NHWC) - { - cl::Image2D weights_cl_image; - - const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); - const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); - - slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); - slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); - - if(export_to_cl_image) - { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); - const TensorShape shape2d(image_w, image_h); - const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; - - // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); - } - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - if(export_to_cl_image) - { - _kernel.setArg(idx++, weights_cl_image); - } - add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) - { - add_1D_tensor_argument(idx, biases, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - else - { - Window win_in = window; - - win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); - win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); - - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, 
DataLayoutDimension::HEIGHT); - - const int conv_stride_x = std::get<0>(_conv_info.stride()); - const int conv_stride_y = std::get<1>(_conv_info.stride()); - - win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); - win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); - - Window slice_in = win_in.first_slice_window_3D(); - unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); - add_3D_tensor_argument(idx1, weights, slice); - - if(biases != nullptr) - { - Window slice_biases; - slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); - add_1D_tensor_argument(idx1, biases, slice_biases); - } - - _kernel.setArg(idx1++, static_cast(weights->info()->strides_in_bytes()[3])); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h deleted file mode 100644 index 384b561003..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H -#define ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the direct convolution kernel. - */ -class ClDirectConvolutionKernel : public IClKernel -{ -public: - ClDirectConvolutionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConvolutionKernel); - /** Set the src, weights, biases and dst tensors info. - * - * @note: Due to set_valid_region(), thus src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed. 
- * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2, stride_y = 1/2 - * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 - * 9x9 convolution with stride_x = 1/2, stride_y = 1/2 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the src's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] dst Output tensor info. - * The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info); - /** Static function to check if given info will lead to a valid configuration of @ref ClDirectConvolutionKernel - * - * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the src's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] dst Output tensor info. - * The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] target Target GPU architecture. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - DataLayout _data_layout{}; - BorderSize _border_size{}; - PadStrideInfo _conv_info{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_DIRECT_CONVOLUTION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp new file mode 100644 index 0000000000..0e15bffd14 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp @@ -0,0 +1,509 @@ +/* + * Copyright (c) 2017-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClPool2dKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +// Internal window config info +using ClPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size + +void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) +{ + TensorShape out_shape = compute_pool_shape(*src, pool_info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); + if(indices) + { + auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); + } +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), + "Unsupported combination of parameters!"); + + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_global_pooling = pool_info.is_global_pooling; + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + int output_width = 0; + int output_height = 0; + std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + pool_size_x, pool_size_y, pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + + // Check indices + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + + if(indices->total_size() != 0) + { + TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); + } + } + + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + } + + return Status{}; +} + +std::tuple validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Get data layout + const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_right = pad_stride_info.pad_right(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const int pool_pad_bottom = pad_stride_info.pad_bottom(); + BorderSize border_size = BorderSize(); + + auto_init(src, dst, indices, pool_info); + pooled_w = dst->tensor_shape()[idx_width]; + pooled_h = dst->tensor_shape()[idx_height]; + + const DataType data_type = src->data_type(); + + const int src_width = src->dimension(idx_width); + const int src_height = src->dimension(idx_height); + + unsigned int num_elems_processed_per_iteration = 0; + bool window_changed = false; + Window win{}; + switch(data_layout) + { + case DataLayout::NCHW: + { + // Initialize border size + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + // Change the number of elements processed per iteration + // for pooling 3x3 with stride less equal than 3 + const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); + num_elems_processed_per_iteration = can_optimize ? 
4 : 1; + const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; + + // Number of iterations in X dimension + const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; + + // Upper limit for the number of right/bottom border elements that are accessed + const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; + + border_size.right = std::max(upper_bound_w, pool_pad_right); + border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); + + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, + pool_stride_x, pool_stride_y); + AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); + + // Update indices window + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); + indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, src_access, dst_access); + } + + dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + break; + } + case DataLayout::NHWC: + { + const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; + + // Initialize border size + border_size = BorderSize(); + num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); +} +} // namespace + +ClPool2dKernel::ClPool2dKernel() + : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1) +{ +} + +BorderSize ClPool2dKernel::border_size() const +{ + return _border_size; +} + +void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + auto padding_info = get_padding_info({ src, dst, indices }); + + // Set instance variables + _pool_info = pool_info; + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + int pool_stride_x = 0; + int pool_stride_y = 0; + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? 
src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool exclude_padding = pool_info.exclude_padding; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + + // Set build options + CLBuildOptions build_opts; + const DataType data_type = src->data_type(); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, pool_info, indices); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); + + ClPoolingConfig pooling_config = std::get<2>(win_config); + _num_elems_processed_per_iteration = pooling_config.first; + _border_size = pooling_config.second; + + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); + + // Tensor paddings are used to calculate the indices for MAX pooling + if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + { + build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); + build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); + build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); + build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); + build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); + build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); + build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); + } + + if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Check dst dimensions + auto_init(src, dst, indices, pool_info); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); + build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); + build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); + build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); + build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); + build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); + build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); + + // Set the initial value for the pooling operation according to the data type + if(pool_type == PoolingType::MAX) + { +
if(is_data_type_quantized(data_type)) + { + PixelValue type_min{}; + std::tie(type_min, std::ignore) = get_min_max(data_type); + build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>())); + } + else + { + build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); + } + } + else + { + // Pool AVG and Pool L2 initial value + build_opts.add_option("-DINITIAL_VALUE=0"); + } + + build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + + // Create kernel + switch(_data_layout) + { + case DataLayout::NCHW: + { + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; + const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); + const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); + build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); + build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); + + if(pool_type != PoolingType::MAX) + { + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + } + + if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) + { + // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where + // each thread computes 4 dst elements + const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); + + std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") + + support::cpp11::to_string(pool_size_x); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + { + // For max pooling with pool2x2, store indices, which will be used in max unpooling + if(data_type == DataType::F32) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(data_type == DataType::F16) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + } + else // Run general case + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ?
"pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + break; + } + case DataLayout::NHWC: + { + // Floating point mixed precision is support on F16 only + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + + // Wider accumulation is required to avoid accuracy loss + // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) + // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) + DataType acc_data_type = data_type; + + if(use_fp_mixed_precision) + { + acc_data_type = DataType::F32; + } + else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + { + acc_data_type = DataType::S32; + } + + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); + build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); + build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); + build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) + { + build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); + + std::string kernel_name = "pooling_layer_2x2_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + // Set config_id for enabling LWS tuning + _config_id = "pooling_layer_"; + _config_id += lower_string(string_from_data_type(data_type)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_width)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_height)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(src->data_layout())); + + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); +} + +Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); + + return Status{}; +} + +void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + unsigned int pool_stride_x = 0; + unsigned int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); + auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); + + // Collapse window + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + Window slice = window_collapsed.first_slice_window_3D(); + do + { + // Upsample src by pool size + Window in_slice(slice); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), + (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, + pool_stride_x * _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), + (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, + pool_stride_y)); + + // Set srcs + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, in_slice); + add_3D_tensor_argument(idx, dst, slice); + if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_3D_tensor_argument(idx, indices, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); + break; + } + case DataLayout::NHWC: + { + const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); + + Window slice = window_collapsed.first_slice_window_4D(); + Window in_slice = window_collapsed.first_slice_window_4D(); + in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), 
pool_stride_x)); + in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); + in_slice.set(3, Window::Dimension(0, batch_size, 1)); + do + { + // Set srcs + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, in_slice); + add_4D_tensor_argument(idx, dst, slice); + if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_4D_tensor_argument(idx, indices, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h new file mode 100644 index 0000000000..8ecb8eb7b7 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClPool2dKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H +#define ARM_COMPUTE_CL_POOL2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class ClPool2dKernel : public IClKernel +{ +public: + /** Default constructor */ + ClPool2dKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel); + + /** Configure kernel for a given list of arguments + * + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
+ */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +public: + PoolingLayerInfo _pool_info; + DataLayout _data_layout; + BorderSize _border_size; + unsigned int _num_elems_processed_per_iteration; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp b/src/core/gpu/cl/kernels/ClPoolingKernel.cpp deleted file mode 100644 index 08a3ce3784..0000000000 --- a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClPoolingKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -// Internal window config info -using ClPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size - -void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) -{ - TensorShape out_shape = compute_pool_shape(*src, pool_info); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) - { - auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - // Check indices - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - - if(indices->total_size() != 0) - { - TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); - } - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Get data layout - const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - int pool_stride_x = 0; - int pool_stride_y = 0; - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; - int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - BorderSize border_size = BorderSize(); - - auto_init(src, dst, indices, pool_info); - pooled_w = dst->tensor_shape()[idx_width]; - pooled_h = dst->tensor_shape()[idx_height]; - - const DataType data_type = src->data_type(); - - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - - unsigned int num_elems_processed_per_iteration = 0; - bool window_changed = false; - Window win{}; - switch(data_layout) - { - case DataLayout::NCHW: - { - // Initialize border size - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - // Change the number of elements processed per iteration - // for pooling 3x3 with stride less equal than 3 - const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); - num_elems_processed_per_iteration = can_optimize ? 
4 : 1; - const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; - - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, - pool_stride_x, pool_stride_y); - AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); - - // Update indices window - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - break; - } - case DataLayout::NHWC: - { - const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; - - // Initialize border size - border_size = BorderSize(); - num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); -} -} // namespace - -ClPoolingKernel::ClPoolingKernel() - : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1) -{ -} - -BorderSize ClPoolingKernel::border_size() const -{ - return _border_size; -} - -void ClPoolingKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst, indices }); - - // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? 
src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - - // Set build options - CLBuildOptions build_opts; - const DataType data_type = src->data_type(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, pool_info, indices); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - ICLKernel::configure_internal(std::get<1>(win_config)); - - ClPoolingConfig pooling_config = std::get<2>(win_config); - _num_elems_processed_per_iteration = pooling_config.first; - _border_size = pooling_config.second; - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); - - // Tensor paddings are used to calculate the indices for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); - build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); - build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); - build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); - build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); - build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - } - - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Check dst dimensions - auto_init(src, dst, indices, pool_info); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); - build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); - build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); - build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); - build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); - - // Set the initial value for the pooling operation according to the data type - if(pool_type == PoolingType::MAX) - { -
if(is_data_type_quantized(data_type)) - { - PixelValue type_min{}; - std::tie(type_min, std::ignore) = get_min_max(data_type); - build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>())); - } - else - { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); - } - } - else - { - // Pool AVG and Pool L2 initial value - build_opts.add_option("-DINITIAL_VALUE=0"); - } - - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); - - // Create kernel - switch(_data_layout) - { - case DataLayout::NCHW: - { - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; - const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); - build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - - if(pool_type != PoolingType::MAX) - { - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - } - - if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) - { - // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where - // each thread computes 4 dst elements - const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); - - std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") - + support::cpp11::to_string(pool_size_x); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - // For max pooling with pool2x2, store indices, which will be used in max unpooling - if(data_type == DataType::F32) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(data_type == DataType::F16) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - } - else // Run general case - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ?
"pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - case DataLayout::NHWC: - { - // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; - - // Wider accumulation is required to avoid accuracy loss - // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) - // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) - DataType acc_data_type = data_type; - - if(use_fp_mixed_precision) - { - acc_data_type = DataType::F32; - } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) - { - acc_data_type = DataType::S32; - } - - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); - build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); - build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) - { - build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); - - std::string kernel_name = "pooling_layer_2x2_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - // Set config_id for enabling LWS tuning - _config_id = "pooling_layer_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_width)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_height)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(src->data_layout())); - - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); -} - -Status ClPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); - - return Status{}; -} - -void ClPoolingKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - Window slice = window_collapsed.first_slice_window_3D(); - do - { - // Upsample src by pool size - Window in_slice(slice); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), - (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, - pool_stride_x * _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), - (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, - pool_stride_y)); - - // Set srcs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_3D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); - break; - } - case DataLayout::NHWC: - { - const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); - - Window slice = window_collapsed.first_slice_window_4D(); - Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), 
pool_stride_x)); - in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - in_slice.set(3, Window::Dimension(0, batch_size, 1)); - do - { - // Set srcs - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, in_slice); - add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_4D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.h b/src/core/gpu/cl/kernels/ClPoolingKernel.h deleted file mode 100644 index c1ce859e2c..0000000000 --- a/src/core/gpu/cl/kernels/ClPoolingKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_POOLING_KERNEL_H -#define ARM_COMPUTE_CL_POOLING_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class ClPoolingKernel : public IClKernel -{ -public: - /** Default constructor */ - ClPoolingKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPoolingKernel); - - /** Configure kernel for a given list of arguments - * - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref ClPoolingKernel - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
- * @param[in] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32. - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - PoolingLayerInfo _pool_info; - DataLayout _data_layout; - BorderSize _border_size; - unsigned int _num_elems_processed_per_iteration; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_POOLING_KERNEL_H */ -- cgit v1.2.1
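Editor's note on the quantized direct-convolution path: ClDirectConv2dKernel::configure() converts the float requantization factor iqinfo.scale * wqinfo.scale / oqinfo.scale into the integer -DOUTPUT_MULTIPLIER and -DOUTPUT_SHIFT build options through quantization::calculate_quantized_multiplier(). The following is a minimal, self-contained C++ sketch of that fixed-point decomposition, assuming a multiplier in (0, 1) (the common case); it is an illustration, not the library's implementation, and the helper name quantize_multiplier and the scale values are hypothetical.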
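#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a real multiplier M (0 < M < 1) into a Q31 fixed-point multiplier
// and a right shift, so that M ~= quant_mult * 2^-(31 + right_shift).
void quantize_multiplier(double multiplier, std::int32_t *quant_mult, int *right_shift)
{
    int exponent = 0;
    const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    std::int64_t q_fixed = static_cast<std::int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // q rounded up to exactly 1.0: renormalize
    {
        q_fixed /= 2;
        ++exponent;
    }
    *quant_mult  = static_cast<std::int32_t>(q_fixed);
    *right_shift = -exponent;
}

int main()
{
    // Hypothetical quantization scales for src, weights and dst
    const double multiplier = 0.02 * 0.01 / 0.1; // = 0.002

    std::int32_t output_multiplier = 0;
    int          output_shift      = 0;
    quantize_multiplier(multiplier, &output_multiplier, &output_shift);

    std::printf("-DOUTPUT_MULTIPLIER=%d -DOUTPUT_SHIFT=%d\n", output_multiplier, output_shift);
    return 0;
}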
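With these two constants the OpenCL kernel can requantize its S32 accumulator using integer arithmetic only, conceptually dst ~= DST_OFFSET + ((acc * OUTPUT_MULTIPLIER) >> (31 + OUTPUT_SHIFT)); the exact rounding and saturation behaviour lives in the generated kernel code.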
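Similarly, the pooling validate_arguments() shown above rejects any configuration whose computed output extent is smaller than 1, using scaled_dimensions_signed(). A rough standalone model of that size computation, assuming FLOOR rounding (other rounding policies exist in the library), is sketched below; pooled_extent_floor is a hypothetical name.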
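#include <cstdio>

// Pooled output extent for one spatial dimension with FLOOR rounding:
// floor((in + pad_begin + pad_end - pool) / stride) + 1.
// Note: C++ integer division truncates toward zero rather than flooring,
// but any configuration where that matters yields a result < 1 and is
// rejected by the validation check anyway.
int pooled_extent_floor(int in, int pool, int stride, int pad_begin, int pad_end)
{
    return (in + pad_begin + pad_end - pool) / stride + 1;
}

int main()
{
    // e.g. 3x3 pooling, stride 2, padding 1 on a 7x7 plane -> 4x4 output
    const int out_w = pooled_extent_floor(7, 3, 2, 1, 1);
    const int out_h = pooled_extent_floor(7, 3, 2, 1, 1);
    std::printf("pooled output: %dx%d (validation requires both >= 1)\n", out_w, out_h);
    return 0;
}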