From 4b90865ab985d571f70c60583cdfb8c7a65f1670 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 18 Oct 2018 10:21:02 +0100 Subject: COMPMID-1413 - Improve the performance of GEMMLowp with 8 bit dot product on OpenCL COMPMID-1424 - Add dot product support for CLDepthwise QASYMM8 3x3 NHWC non-unit stride With this patch we are able to improve the performance of MobileNet v1-qasymm8 by 37 % Tried to use the dot product instruction in CLDepthwise QASYMM8 3x3 NHWC non-unit stride but I have not seen any benefit (maybe because we have few arithmetic operations and we do not have more load instructions). However Depthwise convolution has been improved by 30% Change-Id: Id768a99c2e53a04276707e427af5d0ec93419ada Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/155082 Tested-by: bsgcomp Reviewed-by: Georgios Pinitas --- .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 51 +++++++++++++++++----- 1 file changed, 40 insertions(+), 11 deletions(-) (limited to 'src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp') diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp index 3888353ee7..d348f2c06d 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -46,11 +46,18 @@ class Coordinates; namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); 
ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { @@ -64,11 +71,11 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row != nullptr && vector_sum_row->dimension(0) != mm_result->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); if(output_shape.num_dimensions() > 1) @@ -96,7 +103,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, +std::pair validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { constexpr unsigned int num_elems_processed_per_iteration = 4; @@ -119,28 +126,37 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access); } + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), 
bias->tensor_shape()[1]); + window_changed = window_changed || update_window_and_padding(win, bias_access); + } + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr) { } -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + bias != nullptr ? 
bias->info() : nullptr, a_offset, b_offset)); // NOLINT _vector_sum_col = vector_sum_col; _vector_sum_row = vector_sum_row; _mm_result = mm_result; + _bias = bias; // Check if input is a 3D reinterpretation const bool reinterpret_as_3d = vector_sum_row != nullptr @@ -161,20 +177,24 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + std::string kernel_name("gemmlowp_offset_contribution"); // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(mm_result->info(), vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + bias != nullptr ? 
bias->info() : nullptr, a_offset, b_offset); // NOLINT ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); // Set config_id for enabling LWS tuning - _config_id = "gemmlowp_offset_contribution_"; + _config_id = kernel_name + "_"; _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); @@ -182,13 +202,14 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); } -Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, +Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? 
bias->clone().get() : nullptr, a_offset, b_offset) .first); // NOLINT @@ -214,6 +235,10 @@ void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQu win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + do { unsigned int idx = 0; @@ -226,7 +251,11 @@ void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQu { add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row); } - enqueue(queue, *this, slice); + if(_bias != nullptr) + { + add_1D_tensor_argument(idx, _bias, biases_slice); + } + enqueue(queue, *this, slice, lws_hint()); } while(collapsed.slide_window_slice_3D(slice)); } -- cgit v1.2.1