diff options
Diffstat (limited to 'src/core/CL/kernels')
9 files changed, 390 insertions, 71 deletions
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index eb561faf77..19cc649c96 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -246,13 +246,12 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, int output_shift = 0; quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + build_opts.add_option("-DREAL_MULTIPLIER=" + support::cpp11::to_string(multiplier)); build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)); build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); if(act_info.enabled()) { diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index d3bed87037..93d96dad1b 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -159,8 +159,9 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2); ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1); - const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type()); - const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); + const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type()); + const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); _input = input; _output = output; @@ -169,7 +170,14 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, _conv_stride_y = conv_info.stride().second; _num_rows_processed_per_iteration = is_stride_1 ? 2 : 1; _num_planes_processed_per_iteration = is_stride_1 ? 2 : 1; - _border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); + + // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1 + if(is_dot8_supported && is_qasymm) + { + _num_planes_processed_per_iteration = 1; + } + + _border_size = BorderSize(is_qasymm && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size()); @@ -187,13 +195,12 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, int output_shift = 0; quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + build_opts.add_option("-DREAL_MULTIPLIER=" + support::cpp11::to_string(multiplier)); build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1))); build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); if(act_info.enabled()) { @@ -240,9 +247,8 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, } // Create kernel - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported - && is_stride_1 /* FIXME (COMPMID-1424) */) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : ""); + std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported + && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : ""); _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp index ae54e77972..f333c1bff3 100644 --- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -115,7 +115,7 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() { } -void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d) +void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d, bool unroll_block) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -132,6 +132,7 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out // Create build options CLBuildOptions build_opts; build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + build_opts.add_option_if(unroll_block, "-DUNROLL_BLOCK"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1))); build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2))); diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp index 99e184050e..73b1d41eb1 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -108,6 +108,7 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, ElementsProcessed &num_elements_processed) { + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); @@ -126,7 +127,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe } // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info))); + auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)).set_data_type(DataType::S32)); TensorInfo tmp_info(*output); @@ -173,8 +174,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe else { // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x - num_elems_processed_per_iteration_x = 4; - num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 5); + // Note: if the dot product instruction is available, the 8x2 tile has to be used + num_elems_processed_per_iteration_x = is_dot8_supported ? 8 : 4; + num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), is_dot8_supported ? 2 : 4); // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic @@ -270,6 +272,7 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC // the correct step which is calculated as (16 * mult_transpose1xW_width) / 4) build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); build_opts.add_option("-DTRANSPOSE1XW_WIDTH_STEP=" + support::cpp11::to_string(4 * mult_transpose1xW_width)); build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp index 3888353ee7..d348f2c06d 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -46,11 +46,18 @@ class Coordinates; namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { @@ -64,11 +71,11 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row != nullptr && vector_sum_row->dimension(0) != mm_result->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); if(output_shape.num_dimensions() > 1) @@ -96,7 +103,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { constexpr unsigned int num_elems_processed_per_iteration = 4; @@ -119,28 +126,37 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access); } + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); + window_changed = window_changed || update_window_and_padding(win, bias_access); + } + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr) { } -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + bias != nullptr ? bias->info() : nullptr, a_offset, b_offset)); // NOLINT _vector_sum_col = vector_sum_col; _vector_sum_row = vector_sum_row; _mm_result = mm_result; + _bias = bias; // Check if input is a 3D reinterpretation const bool reinterpret_as_3d = vector_sum_row != nullptr @@ -161,20 +177,24 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + std::string kernel_name("gemmlowp_offset_contribution"); // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(mm_result->info(), vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + bias != nullptr ? bias->info() : nullptr, a_offset, b_offset); // NOLINT ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); // Set config_id for enabling LWS tuning - _config_id = "gemmlowp_offset_contribution_"; + _config_id = kernel_name + "_"; _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); @@ -182,13 +202,14 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); } -Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, +Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, a_offset, b_offset) .first); // NOLINT @@ -214,6 +235,10 @@ void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQu win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + do { unsigned int idx = 0; @@ -226,7 +251,11 @@ void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQu { add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row); } - enqueue(queue, *this, slice); + if(_bias != nullptr) + { + add_1D_tensor_argument(idx, _bias, biases_slice); + } + enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp new file mode 100644 index 0000000000..83af0c63eb --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include <cstddef> +#include <cstdint> + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, + int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); + ARM_COMPUTE_RETURN_ERROR_ON(bias == nullptr && a_offset == 0 && b_offset == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output, + int32_t a_offset, int32_t b_offset) +{ + constexpr unsigned int num_elems_processed_per_iteration = 4; + bool window_changed = false; + + // Auto initialize the output + auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8)); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, mm_result_access); + + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, output_access); + + if(a_offset != 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); + } + if(b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT + window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access); + } + + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); + window_changed = window_changed || update_window_and_padding(win, bias_access); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel() + : _mm_result(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _output(nullptr) +{ +} + +void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + bias != nullptr ? bias->info() : nullptr, + output->info(), + a_offset, b_offset, output_stage)); // NOLINT + + const int min = output_stage.gemmlowp_min_bound; + const int max = output_stage.gemmlowp_max_bound; + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + _bias = bias; + _output = output; + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->info()->num_dimensions() > 1 + && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); + build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shift)); + build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + + std::string kernel_name("gemmlowp_offset_contribution"); + + // Fuse output stage + if(output_stage.type != GEMMLowpOutputStageType::NONE) + { + kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); + } + else + { + ARM_COMPUTE_ERROR("GEMMLowpOutputStage can not be NONE!"); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + bias != nullptr ? bias->info() : nullptr, + output->info(), + a_offset, b_offset); // NOLINT + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name + "_"; + _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); +} + +Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output->clone().get(), + a_offset, b_offset) + .first); // NOLINT + + return Status{}; +} + +void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _mm_result, slice); + if(_vector_sum_col != nullptr) + { + add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col); + } + if(_vector_sum_row != nullptr) + { + add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row); + } + if(_bias != nullptr) + { + add_1D_tensor_argument(idx, _bias, biases_slice); + } + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index d403d67173..38e0474dde 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -42,7 +42,7 @@ namespace arm_compute namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max, unsigned int output_3d_depth) + int min, int max) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(max > 255); @@ -58,10 +58,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con if(output->total_size() != 0) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input, output_3d_depth, true); - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); } return Status{}; @@ -69,7 +67,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) { - constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_processed_per_iteration = 4; // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); @@ -103,15 +101,15 @@ class Coordinates; } // namespace arm_compute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr), _reinterpret_as_3d(false) + : _input(nullptr), _bias(nullptr), _output(nullptr) { } Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max, unsigned int output_3d_depth) + int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max, output_3d_depth)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr) ? bias->clone().get() : nullptr, output->clone().get()) @@ -122,22 +120,20 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max, unsigned int output_3d_depth) + int min, int max) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input->info(), output_3d_depth, true); - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8).set_tensor_shape(output_shape)); + auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8)); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), - min, max, output_3d_depth)); + min, max)); - _input = input; - _bias = bias; - _output = output; - _reinterpret_as_3d = output_3d_depth > 1; + _input = input; + _bias = bias; + _output = output; // Set the arguments to pass at compile time CLBuildOptions build_opts; @@ -147,7 +143,6 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option_if(_reinterpret_as_3d, "-DDST_HEIGHT=" + support::cpp11::to_string(input->info()->tensor_shape().y() / output_3d_depth)); // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options())); @@ -177,32 +172,12 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window add_1D_tensor_argument(idx1, _bias, biases_slice); } - if(_reinterpret_as_3d) + do { - // Create output window - Window window_out; - window_out.use_tensor_dimensions(_output->info()->tensor_shape()); - Window collapsed_out = window_out.collapse_if_possible(window_out, 3); - Window slice_out = collapsed.first_slice_window_4D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_4D_tensor_argument(idx1, _output, slice_out); - enqueue(queue, *this, slice); - } - while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); - } - else - { - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice); - } - while(collapsed.slide_window_slice_3D(slice)); + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx1, _output, slice); + enqueue(queue, *this, slice); } + while(collapsed.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp index 57891131c7..621bd2b54b 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -63,7 +63,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) { - constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_processed_per_iteration = 4; // Configure kernel window Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp index 9cf5d1fb6a..225c358b20 100644 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -59,7 +60,7 @@ std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITens Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); + AccessWindowStatic input_access(input, 0, 0, input->dimension(0), input->dimension(1)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); bool window_changed = update_window_and_padding(win, input_access, output_access); @@ -115,8 +116,12 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTens CLBuildOptions build_opts; build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0))); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + + std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); + // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info()); |