From 358ca205c9e41f523517ffa55a9057308b736040 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 7 Dec 2017 16:47:52 +0000 Subject: COMPMID-617: Adds CLFullyConnectionLayer validation support Change-Id: I4d2eb9872a3165fdcaa7784596e441cbe563dbc2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112577 Tested-by: Jenkins Reviewed-by: Ioan-Cristian Szabo Reviewed-by: Anthony Barbier --- src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp | 89 +++++++---- .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp | 135 +++++++++++------ .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 134 +++++++++++------ src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 104 +++++++++---- .../kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp | 65 +++++--- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 166 +++++++++++++-------- src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp | 97 ++++++++---- src/core/CL/kernels/CLIm2ColKernel.cpp | 34 ++++- .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 8 +- 9 files changed, 579 insertions(+), 253 deletions(-) (limited to 'src/core') diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp index 7741f12900..6886f54602 100644 --- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -33,8 +33,55 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, + DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; + bool window_changed = false; + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + window_changed = window_changed || update_window_and_padding(win, input_access); + + // Configure window in case of configured output + if(output->total_size() != 0) + { + AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); + window_changed = window_changed || update_window_and_padding(win, output_access); + output_access.set_valid_region(win, input->valid_region()); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() : _input(nullptr), _output(nullptr) @@ -43,22 +90,13 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, - DataType::U16, DataType::S16, DataType::QS16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - - TensorShape output_shape = input->info()->tensor_shape(); - output_shape.set(0, input->info()->dimension(0) * 4); - output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f)); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info()))); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; _output = output; @@ -68,20 +106,9 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name)); // Configure kernel window - const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type()); - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure(win); + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); // Set config_id for enabling LWS tuning _config_id = "interleave4x4_"; @@ -92,6 +119,14 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out _config_id += support::cpp11::to_string(output->info()->dimension(1)); } +Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + + return Status{}; +} + void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp index 1d9fe4bc01..423592b79c 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -46,79 +46,114 @@ namespace arm_compute class Coordinates; } // namespace arm_compute -CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr) +namespace { -} +using ElementsProcessed = Steps; -void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed) +Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); if(!is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1)); } - _input0 = input0; - _input1 = input1; - _output = output; + return Status{}; +} - CLBuildOptions build_opts; +std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed, + ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + + Window win{}; + bool window_changed = false; + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication if(is_interleaved_transposed) { - // Create kernel and set static arguments - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options())); - // Configure window - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; + num_elems_processed_per_iteration_x = 16; + num_elems_processed_per_iteration_y = 4; constexpr unsigned int num_elems_read_per_iteration_input0 = 4; constexpr unsigned int num_elems_read_per_iteration_input1 = 16; - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); - AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - update_window_and_padding(win, input0_access, input1_access, output_access); + AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration_input0, 1); + AccessWindowRectangle input1_access(input1, 0, 0, num_elems_read_per_iteration_input1, 1); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + window_changed = update_window_and_padding(win, input0_access, input1_access, output_access); - ICLKernel::configure(win); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); } else { // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); - - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)); - - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options())); + num_elems_processed_per_iteration_x = 16; + num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); // Configure window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); - AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1)); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - update_window_and_padding(win, input0_access, input1_access, output_access); + window_changed = update_window_and_padding(win, input0_access, input1_access, output_access); Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + coord.set_num_dimensions(output->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed)); + + _input0 = input0; + _input1 = input1; + _output = output; + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); - ICLKernel::configure(win); + // Create build options + CLBuildOptions build_opts; + std::string kernel_name(" "); + if(is_interleaved_transposed) + { + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + kernel_name = "gemmlowp_mm_interleaved_transposed"; } + else + { + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x())); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y())); + kernel_name = "gemmlowp_mm"; + } + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); // Set config_id for enabling LWS tuning _config_id = "gemmlowp_"; @@ -132,6 +167,20 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } +Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), + input1->clone().get(), + output->clone().get(), + is_interleaved_transposed, + num_elements_processed) + .first); + + return Status{}; +} + void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp index 2877a74be8..d05939fcf5 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -44,89 +44,135 @@ namespace arm_compute class Coordinates; } // namespace arm_compute -CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) +namespace { -} - -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr if(b_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1)); - // Validate batches - TensorShape output_shape = mm_result->info()->tensor_shape(); + TensorShape output_shape = mm_result->tensor_shape(); if(output_shape.num_dimensions() > 1) { - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse_from(1); output_shape.collapse_from(2); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], + "mm_result tensor must have the same number of batches of output tensor"); if(a_offset != 0) { - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); } } - - build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset)); } - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - - // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; + return Status{}; +} +std::pair validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ constexpr unsigned int num_elems_processed_per_iteration = 16; + bool window_changed = false; // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, mm_result_access); + AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, + mm_result_access); if(a_offset != 0) { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, vector_sum_col_access); + AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, + vector_sum_col_access); } - if(b_offset != 0) { - AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); - update_window_and_padding(win, vector_sum_row_access); + AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT + window_changed = window_changed || update_window_and_padding(win, + vector_sum_row_access); } - ICLKernel::configure(win); + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) +{ +} + +void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + a_offset, b_offset)); // NOLINT + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + a_offset, b_offset); // NOLINT + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); +} + +Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + a_offset, b_offset) + .first); // NOLINT + + return Status{}; } void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp index bcf04b0982..6951512167 100644 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp @@ -44,6 +44,59 @@ namespace arm_compute class Coordinates; } // namespace arm_compute +namespace +{ +Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + return Status{}; +} +std::pair validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output) +{ + const unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + return Status{}; +} + +std::pair validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1)); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() : _input(), _output() { @@ -51,8 +104,9 @@ ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); _input = mtx_a; _output = vector_sum_row; @@ -64,21 +118,18 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTens // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options())); - const unsigned int num_elems_processed_per_iteration = 1; - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get()).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) @@ -107,8 +158,8 @@ void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueu void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); _input = mtx_b; _output = vector_sum_col; @@ -121,21 +172,18 @@ void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTens // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options())); - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), num_elems_processed_per_iteration), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp index 015b4f70a4..d5c93dd24a 100644 --- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -36,6 +36,37 @@ using namespace arm_compute; +namespace +{ +Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1); + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target, + unsigned int &num_elems_processed_per_iteration) +{ + // Select the vector size to use (8 for Bifrost; 16 for Midgard). + num_elems_processed_per_iteration = (gpu_target == GPUTarget::BIFROST) ? 8 : 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1)); + AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, biases_access, accum_access); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() : _accum(nullptr), _biases(nullptr) { @@ -43,18 +74,21 @@ CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum); - ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); _biases = biases; _accum = accum; // Get the target architecture - GPUTarget arch_target = get_arch_from_target(get_target()); - // Select the vector size to use (8 for Bifrost; 16 for Midgard). - const unsigned int vector_size = (arch_target == GPUTarget::BIFROST) ? 8 : 16; + GPUTarget arch_target = get_arch_from_target(get_target()); + unsigned int vector_size = 0; + + // Configure kernel window + auto win_config = validate_and_configure_window(accum->info(), biases->info(), arch_target, vector_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); // Add build options CLBuildOptions build_opts; @@ -65,18 +99,15 @@ void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTe // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts.options())); +} - // Configure kernel window - const unsigned int num_elems_processed_per_iteration = vector_size; - - Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1)); - AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, biases_access, accum_access); +Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target) +{ + unsigned int num_elems_processed_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, num_elems_processed_per_iteration).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 16706dd748..f51d0f92d4 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -42,6 +42,81 @@ using namespace arm_compute; +namespace +{ +using ElementsProcessed = Steps; + +inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); + if(!is_interleaved_transposed) + { + ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1)); + } + + return Status{}; +} + +inline std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, + bool is_interleaved_transposed, GPUTarget gpu_target, + ElementsProcessed &num_elements_processed) +{ + bool window_changed = false; + Window win{}; + + const DataType data_type = input0->data_type(); + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + + if(is_interleaved_transposed) + { + // Configure kernel window + num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); + num_elems_processed_per_iteration_y = 4; + + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + window_changed = update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + } + else // The input tensors have not been reshaped + { + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case. + num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); + num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); + + // Create kernels according to the architecture, data type and input size. + if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32) + { + num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000) ? 2 : 4; + } + + // Configure window + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1)); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + window_changed = update_window_and_padding(win, input0_access, input1_access, output_access); + + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() : _input0(nullptr), _input1(nullptr), _output(nullptr) { @@ -49,13 +124,10 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - if(!is_interleaved_transposed) - { - ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); - } + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed)); _input0 = input0; _input1 = input1; @@ -82,14 +154,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _lws_hint = cl::NDRange(8, 8); } + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); + // Create build options CLBuildOptions build_opts; build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos)); - const bool multiply_alpha = std::abs(1.0f - alpha) > 0.00001f; - // Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications. - if(multiply_alpha) + if(std::abs(1.0f - alpha) > 0.00001f) { build_opts.add_option_if_else(is_data_type_fixed_point(data_type), "-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))), @@ -108,49 +185,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen { kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)); } - - // Configure kernel window - const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); - AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - update_window_and_padding(win, input0_access, input1_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - - ICLKernel::configure(win); } else // The input tensors have not been reshaped { build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); - // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case. - unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); - const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); - // Create kernels according to the architecture, data type and input size. if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32) { // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g. // FC6 and FC7 of AlexNet and VGG-16). - if(input1->info()->dimension(0) <= 1000) - { - // Each work-item processes 2 elements in the X dimension. - num_elems_processed_per_iteration_x = 2; - kernel_name = "gemm_mm_floating_point_f32_bifrost_1000"; - } - else - { - // Each work-item processes 4 elements in the X dimension (as in the default case). - num_elems_processed_per_iteration_x = 4; - kernel_name = "gemm_mm_floating_point_f32_bifrost"; - } + kernel_name = (input1->info()->dimension(0) <= 1000) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost"; + // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels // via exhaustive autotuning over a range of representative layer configurations. _lws_hint = cl::NDRange(4); @@ -164,23 +211,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); kernel_name = "gemm_mm_floating_point"; } - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)); - - // Configure window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); - AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - update_window_and_padding(win, input0_access, input1_access, output_access); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure(win); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y())); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x())); } // Create kernel @@ -198,6 +230,22 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } +Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), + input1->clone().get(), + output->clone().get(), + is_interleaved_transposed, + gpu_target, + num_elements_processed) + .first); + + return Status{}; +} + void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp index 35074f94cf..69a545b76b 100644 --- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -33,36 +33,82 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; -void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, - DataType::U16, DataType::S16, DataType::QS16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, + DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_transpose1xW_with_element_size_shape(*input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } - TensorShape output_shape{ input->info()->tensor_shape() }; - const size_t transpose_w = 16 / input->info()->element_size(); - output_shape.set(0, input->info()->dimension(1) * transpose_w); - output_shape.set(1, static_cast(std::ceil((input->info()->dimension(0) / static_cast(transpose_w))))); + return Status{}; +} - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration) +{ + num_elems_processed_per_iteration = 16 / input->element_size(); + + const int scale_x = num_elems_processed_per_iteration; + bool window_changed = false; + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + if((win.x().end() / scale_x) == 0) + { + return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win); + } + + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, input_access); + + // Configure window in case of configured output + if(output->total_size() != 0) + { + AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + window_changed = window_changed || update_window_and_padding(win, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape())); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); +void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info()))); - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); - const int scale_x = num_elems_processed_per_iteration; + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; _output = output; + // Configure kernel window + unsigned int num_elems_processed_per_iteration = 1; + auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); + /* * Following an example of how the transposition1xW works when the input data type is F32 * @@ -76,20 +122,15 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp // Create kernel std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration); _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name)); +} - // Configure window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - - ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension"); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape())); +Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + unsigned int num_elems_processed_per_iteration = 1; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 6514d6cf91..0e9f2c5344 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -39,6 +39,24 @@ using namespace arm_compute; +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Status{}; +} +} // namespace + CLIm2ColKernel::CLIm2ColKernel() : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr) { @@ -46,9 +64,10 @@ CLIm2ColKernel::CLIm2ColKernel() void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; _output = output; @@ -184,6 +203,15 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _config_id += support::cpp11::to_string(output->info()->dimension(1)); } +Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_UNUSED(kernel_dims); + ARM_COMPUTE_UNUSED(conv_info); + ARM_COMPUTE_UNUSED(has_bias); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON(_run_func == nullptr); diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index f6964002dd..3d41548a6a 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -71,16 +71,16 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto vector_sum_row_shape.collapse_from(1); output_shape.collapse_from(2); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], + "mm_result tensor must have the same number of batches of output tensor"); if(a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); } } } -- cgit v1.2.1