From 19835e591cb0b66a0f5000ae1505bf299e50337d Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Tue, 30 Jan 2018 13:35:54 +0000 Subject: COMPMID-882 - Optimizing GEMMLowp on OpenCL reshaping matrices This new optimization allows to achieve 36.3 % of MAC utilisation on Mate 9 @ 1GHz. The performance have been reported here https://confluence.arm.com/display/MLENG/GEMMLowp+performance%3A+ACL+18.02 Change-Id: I71b6a217068763dfdc11bbf3574ee0eb94f93679 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118531 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp | 88 ++++++++++++++++++---- .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 12 ++- src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp | 1 + 3 files changed, 84 insertions(+), 17 deletions(-) (limited to 'src/core/CL/kernels') diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp index 2f96724210..ae498ec8a7 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/AccessWindowTranspose.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -34,6 +35,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/ToolchainSupport.h" #include @@ -41,6 +43,7 @@ #include using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; namespace arm_compute { @@ -51,14 +54,53 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + if(!is_interleaved_transposed) { ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1)); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + } + } + else + { + const int m = reshape_info.m(); + const int n = reshape_info.n(); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + TensorShape tensor_shape0{ input0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ input1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast(n)); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast(m)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + } } return Status{}; @@ -76,16 +118,14 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication if(is_interleaved_transposed) { - // Configure window - num_elems_processed_per_iteration_x = 16; - num_elems_processed_per_iteration_y = 4; - constexpr unsigned int num_elems_read_per_iteration_input0 = 4; - constexpr unsigned int num_elems_read_per_iteration_input1 = 16; + // Configure kernel window + num_elems_processed_per_iteration_x = 4; + num_elems_processed_per_iteration_y = 4; win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration_input0, 1); - AccessWindowRectangle input1_access(input1, 0, 0, num_elems_read_per_iteration_input1, 1); + AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); window_changed = update_window_and_padding(win, input0_access, input1_access, output_access); @@ -122,10 +162,18 @@ CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() { } -void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed) +void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed)); + + // Output tensor auto inizialitation if not yet initialized + TensorShape tensor_shape{ input0->info()->tensor_shape() }; + tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0)); + tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1)); + + auto_init_if_empty(*output->info(), tensor_shape, 1, DataType::S32, 1, QuantizationInfo()); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info)); _input0 = input0; _input1 = input1; @@ -146,8 +194,18 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC std::string kernel_name(" "); if(is_interleaved_transposed) { + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + // Note: The computation tile has the x dimension equal to 4 which is less than the transpose_width (16) + // In order to access correctly the elements from the transposed matrix B, we need to pass + // the correct step which is calculated as (16 * mult_transpose1xW_width) / 4) + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); - kernel_name = "gemmlowp_mm_interleaved_transposed"; + build_opts.add_option("-DTRANSPOSE1XW_WIDTH_STEP=" + support::cpp11::to_string(4 * mult_transpose1xW_width)); + build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + + kernel_name = "gemmlowp_mm_interleaved_transposed_" + string_from_target(arch_target); } else { @@ -171,10 +229,10 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } -Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get(), diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp index d05939fcf5..221a1566b9 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -91,7 +91,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto std::pair validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset) { - constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_processed_per_iteration = 4; bool window_changed = false; // Configure kernel window @@ -160,6 +160,14 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I a_offset, b_offset); // NOLINT ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "gemmlowp_offset_contribution_"; + _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); } Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp index 63aed6df32..24d218760e 100644 --- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -113,6 +113,7 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp // Create build options CLBuildOptions build_opts; + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); -- cgit v1.2.1