From bb081cac4f386eb6db6e9927fce27c7027dd7be5 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 8 Nov 2018 10:22:01 +0000 Subject: COMPMID-1751: Remove output_3d_depth from NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint Change-Id: I1d5bc4d24059917f9ddef0873dd3043b1f2320a8 --- ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h | 20 +- .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 19 +- .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 307 ++++++++++++--------- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 69 ++--- src/runtime/NEON/functions/NEGEMM.cpp | 2 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 51 ++-- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 82 ++++-- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 8 +- 8 files changed, 303 insertions(+), 255 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index 8412fa229f..15d09e27ff 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -72,24 +72,21 @@ public: * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - * @param[in] output_3d_depth (Optional) Depth of output in 3D (Defaults to 1) */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = 0, int max = 0, unsigned int output_3d_depth = 1); + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. 
Data type supported: Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - * @param[in] output_3d_depth (Optional) Depth of output in 3D (Defaults to 1) * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0, unsigned int output_3d_depth = 1); + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -117,7 +114,6 @@ private: int _result_offset_after_shift; int _min; int _max; - unsigned int _output_3d_depth; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index f38ecdbbd2..53b91b35b6 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -131,24 +131,21 @@ public: * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - * @param[in] output_3d_depth (Optional) Depth of output in 3D (Defaults to 1) */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = 0, int max = 0, unsigned int output_3d_depth = 1); + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint * - * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. 
Data type supported: Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - * @param[in] output_3d_depth (Optional) Depth of output in 3D (Defaults to 1) * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0, unsigned int output_3d_depth = 1); + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index af84d024d5..33a5b4ace3 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -62,16 +62,24 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto if(b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1)); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); if(output_shape.num_dimensions() > 1) { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(2); + output_shape.collapse_from(output_batch_idx); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); if(a_offset != 0) @@ -117,77 +125,23 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -} // namespace - -NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) -{ -} - -void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT - vector_sum_row != nullptr ? 
vector_sum_row->info() : nullptr, // NOLINT - a_offset, b_offset)); // NOLINT - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1; - } - - // Configure kernel window - auto win_config = validate_and_configure_window(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT - a_offset, b_offset); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - a_offset, b_offset) - .first); // NOLINT - - return Status{}; -} -void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info) +template +void run_offset_contribution(const Window &window, + ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ); + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; + const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; - if(_a_offset != 0 && _b_offset != 0) // true, true + if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(!_slide_vector_sum_col) - { - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - } + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); // Set window for vector_sum_row Window win_vector_sum_row(collapsed_window); @@ -195,42 +149,49 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); - Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); - Iterator mm_result(_mm_result, window); + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + Iterator mm_result_it(mm_result, window); - const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y(); + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; execute_window_loop(collapsed_window, [&](const Coordinates & id) { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + // Compute the leftover term due to a_offset. int32x4x4_t a_offset_term_s32 = { { - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) + vld1q_s32(vector_sum_col_ptr + 0), + vld1q_s32(vector_sum_col_ptr + 4), + vld1q_s32(vector_sum_col_ptr + 8), + vld1q_s32(vector_sum_col_ptr + 12) } }; - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); // Compute the leftover term due to b_offset. 
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y()); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); + int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + + (id.z() % depth_input) * height_input); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); // Add a_offset_term_s32 and b_offset_term_s32 int32x4x4_t offset_term_s32 = { { - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset), - vdupq_n_s32(_k_offset) + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset) } }; @@ -242,10 +203,10 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI int32x4x4_t in_s32 = { { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12) } }; @@ -256,39 +217,44 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); // Store the result with the offset contribution - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12, in_s32.val[3]); }, - vector_sum_col, vector_sum_row, mm_result); + vector_sum_col_it, vector_sum_row_it, mm_result_it); } - else if((_a_offset == 0) && (_b_offset != 0)) // false, true + else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + // Set window for vector_sum_row Window win_vector_sum_row(collapsed_window); win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row); - Iterator mm_result(_mm_result, window); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + Iterator mm_result_it(mm_result, window); - const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y(); + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); execute_window_loop(window, [&](const Coordinates & id) { + const int batch_id = id.z() / depth_input; + // Compute the leftover term due to b_offset. 
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y()); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset); + int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + + (id.z() % depth_input) * height_input); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); int32x4x4_t in_s32 = { { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12) } }; @@ -299,51 +265,54 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32); // Store the result with the offset contribution - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12, in_s32.val[3]); }, - vector_sum_row, mm_result); + vector_sum_row_it, mm_result_it); } - else if((_a_offset != 0) && (_b_offset == 0)) // true, false + else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(!_slide_vector_sum_col) - { - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - } + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col); - Iterator mm_result(_mm_result, window); + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + Iterator mm_result_it(mm_result, window); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; execute_window_loop(window, [&](const Coordinates & id) { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + // Compute the leftover term due to a_offset. 
int32x4x4_t a_offset_term_s32 = { { - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 0), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 4), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 8), - vld1q_s32(reinterpret_cast(vector_sum_col.ptr()) + 12) + vld1q_s32(vector_sum_col_ptr + 0), + vld1q_s32(vector_sum_col_ptr + 4), + vld1q_s32(vector_sum_col_ptr + 8), + vld1q_s32(vector_sum_col_ptr + 12) } }; - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset); + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); int32x4x4_t in_s32 = { { - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result.ptr()) + 12) + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8), + vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12) } }; @@ -354,12 +323,12 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); // Store the result with the offset contribution - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 0, in_s32.val[0]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 4, in_s32.val[1]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 8, in_s32.val[2]); - vst1q_s32(reinterpret_cast(mm_result.ptr()) + 12, in_s32.val[3]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0, in_s32.val[0]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4, in_s32.val[1]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8, in_s32.val[2]); + vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12, in_s32.val[3]); }, - vector_sum_col, mm_result); + vector_sum_col_it, mm_result_it); } else // false, false { @@ -367,3 +336,77 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI return; } } +} // namespace + +NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) +{ +} + +void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT + vector_sum_row != nullptr ? 
vector_sum_row->info() : nullptr, // NOLINT + a_offset, b_offset)); // NOLINT + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + // Check if vector_sum_col_shape should be slidden or not + // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1; + } + + // Configure kernel window + auto win_config = validate_and_configure_window(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT + a_offset, b_offset); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + a_offset, b_offset) + .first); // NOLINT + + return Status{}; +} + +void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = _vector_sum_row != nullptr + && _mm_result->info()->num_dimensions() > 1 + && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x(); + + if(reinterpret_as_3d) + { + run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col); + } + else + { + run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col); + } +} \ No newline at end of file diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 7cd50cc5a0..024c4f8863 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -43,8 +43,7 @@ using namespace arm_compute; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max, unsigned int output_3d_depth) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(max > 255); @@ -60,10 +59,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con if(output->total_size() != 0) { - 
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input, output_3d_depth); - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input); } return Status{}; @@ -76,6 +73,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // For this reason num_elems_processed_per_iteration is set to 1 constexpr unsigned int num_elems_processed_per_iteration = 1; + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8)); + // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); @@ -146,15 +146,15 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window ARM_COMPUTE_UNUSED(min_u8); ARM_COMPUTE_UNUSED(max_u8); - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const unsigned int gemm_3d_height = _input->info()->tensor_shape().y() / _output_3d_depth; + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator in(_input, win_collapsed); + Iterator out(_output, win_collapsed); if(_bias != nullptr) { Window win_biases; @@ -164,16 +164,6 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window Iterator bias(_bias, win_biases); execute_window_loop(win_collapsed, [&](const Coordinates & id) { - // Calculate output coordinates - Coordinates out_coords = id; - if(_output_3d_depth != 1) - { - out_coords.set(Window::DimY, id.y() % gemm_3d_height); - out_coords.set(Window::DimZ, id.y() / gemm_3d_height); - out_coords.set(3, id.z()); - } - uint8_t *out_ptr = _output->ptr_to_element(out_coords); - // Compute 16 elements per iteration int x = window_start_x; for(; x <= (window_end_x - window_step_x); x += window_step_x) @@ -204,7 +194,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - vst1q_u8(out_ptr + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); } // Compute left-over elements @@ -217,26 +207,16 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window in_value += bias_value; // Finalize and store the result - *(out_ptr + x) = finalize_quantization(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast(_min), - static_cast(_max)); + *(out.ptr() + x) = finalize_quantization(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast(_min), + static_cast(_max)); } }, - in, bias); + in, out, bias); } else { execute_window_loop(win_collapsed, [&](const Coordinates & id) { - // 
Calculate output coordinates - Coordinates out_coords = id; - if(_output_3d_depth != 1) - { - out_coords.set(Window::DimY, id.y() % _output_3d_depth); - out_coords.set(Window::DimZ, id.y() / _output_3d_depth); - out_coords.set(3, id.z()); - } - uint8_t *out_ptr = _output->ptr_to_element(out_coords); - // Compute 16 elements per iteration int x = window_start_x; for(; x <= (window_end_x - window_step_x); x += window_step_x) @@ -251,7 +231,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window } }; - vst1q_u8(out_ptr + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8)); } // Compute left-over elements @@ -260,30 +240,24 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast(in.ptr()) + x); // Finalize and store the result - *(out_ptr + x) = finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast(_min), static_cast(_max)); + *(out.ptr() + x) = finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast(_min), static_cast(_max)); } }, - in); + in, out); } } NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() - : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0), _output_3d_depth(1) + : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0) { } void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max, unsigned int output_3d_depth) + int result_offset_after_shift, int min, int max) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input->info(), output_3d_depth); - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8).set_tensor_shape(output_shape)); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), - min, max, output_3d_depth)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max)); _input = input; _bias = bias; @@ -293,7 +267,6 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const _result_offset_after_shift = result_offset_after_shift; _min = min; _max = max; - _output_3d_depth = output_3d_depth; // Configure kernel window auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info()); @@ -305,10 +278,10 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const _func = is_bounded_relu ? 
&NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run; } -Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max, unsigned int output_3d_depth) +Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max, output_3d_depth)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr) ? bias->clone().get() : nullptr, output->clone().get()) diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 82b9cb80ae..72a3e80330 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -139,7 +139,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index d02c63cfb3..24332014aa 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -101,6 +101,9 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col)); + const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, + gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */); + if(_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() @@ -111,7 +114,7 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset)); weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset)); - _mm_gemmlowp.configure(input, weights, nullptr, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)); + _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info); // Revert back QuantizatioInfo as input and weights could be used in other convolution layers input->info()->set_quantization_info(input_quantization_info); @@ -120,8 +123,7 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w else { // Configure matrix multiply function - _mm_gemm.configure(input, weights, 
nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth, - _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */)); + _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info); } } @@ -129,7 +131,8 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens { const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col); + const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, + gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */); if(is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() @@ -256,15 +259,24 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig } // Create temporary GEMM output tensor in case we cannot skip col2im - if(!_skip_col2im) + if(!_skip_col2im || _is_quantized) { - // Calculate GEMM output shape - TensorShape shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input. const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type; + TensorShape shape_gemm; + + if(_is_quantized && _skip_col2im) + { + shape_gemm = output->info()->tensor_shape(); + } + else + { + // Calculate GEMM output shape + shape_gemm = _im2col_output.info()->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + } + // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. TensorInfo info_gemm(shape_gemm, 1, gemm_data_type); info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); @@ -321,8 +333,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _is_activationlayer_enabled = false; } - _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation, - skip_reshape ? conv_h : 1); + _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation); } if(!_skip_col2im && _data_layout == DataLayout::NCHW) @@ -336,7 +347,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _tmp_output.allocator()->allocate(); } - if(!_skip_col2im) + if(!_skip_col2im || _is_quantized) { _gemm_output.allocator()->allocate(); } @@ -464,18 +475,20 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI } // Create temporary GEMM output tensor in case we cannot skip col2im + const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type; if(!skip_col2im) { TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); shape_gemm.set(0, mat_weights_cols); shape_gemm.set(1, conv_w * conv_h); - const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type; - // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input. 
info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type); - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - - gemm_output_to_use = &info_gemm; } + else + { + info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type); + } + info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); + gemm_output_to_use = &info_gemm; ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col)); @@ -516,7 +529,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI } // Validate output stage for quantized case - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation, skip_reshape ? conv_h : 0); + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation); } // Validate Col2Im/ReshapeLayer diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 16ee3d07fd..4b026948b9 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -190,42 +190,68 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1), - "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0), - "The output matrix must have the same number of columns as the matrix B"); - ARM_COMPUTE_UNUSED(gemm_info); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D"); - int32_t a_offset = a->quantization_info().offset; - int32_t b_offset = b->quantization_info().offset; - bool run_vector_matrix_multiplication = a->dimension(1) < 2; + int32_t a_offset = a->quantization_info().offset; + int32_t b_offset = b->quantization_info().offset; + const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - if(!run_vector_matrix_multiplication) + // Check if we need to run the optimized assembly kernel + const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run)); + + if(run_optimised) { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 
16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); + if(gemm_info.depth_output_gemm3d() != 0) + { + if(gemm_info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); + } + } } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); + + const bool run_vector_matrix_multiplication = a->dimension(1) < 2; + if(!run_vector_matrix_multiplication) + { + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->tensor_shape(); + shape_tmp_a.set(0, a->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->tensor_shape(); + shape_tmp_b.set(0, b->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output)); + } } TensorInfo info_vector_sum_col, info_vector_sum_row; diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index d270a77fc2..ce69fa0bfd 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -43,14 +43,14 @@ Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *inpu } void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max, unsigned int output_3d_depth) + int result_offset_after_shift, int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, output_3d_depth); + k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, 
result_offset_after_shift, min, max); _kernel = std::move(k); } -Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max, unsigned int output_3d_depth) +Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max, output_3d_depth); + return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); } \ No newline at end of file -- cgit v1.2.1
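
The hunks above move the GEMM-3D handling out of the output stage (the removed output_3d_depth parameter) and into NEGEMMLowpOffsetContributionKernel, which now derives the batch and row-sum indices from the collapsed window coordinates inside run_offset_contribution<is_gemm3d>. Below is a minimal standalone sketch of that index math only; the concrete shape (6 rows x 4 depth slices x 2 batches) and the helper names are assumptions chosen for illustration and are not part of the library.

// Illustration only (not part of the patch): the 3D-reinterpretation index
// math that NEGEMMLowpOffsetContributionKernel now performs internally,
// instead of the output stage receiving an explicit output_3d_depth.
#include <cstdio>

struct RowSumIndex
{
    int batch; // which batch of vector_sum_row is read
    int row;   // which row-sum entry inside that batch
};

// Mirrors run_offset_contribution<true>(...):
//   batch_id = id.z() / depth_input
//   row      = id.y() + (id.z() % depth_input) * height_input
// where height_input = mm_result->dimension(1) and depth_input = mm_result->dimension(2).
RowSumIndex row_sum_index_3d(int y, int z, int height_input, int depth_input)
{
    return { z / depth_input, y + (z % depth_input) * height_input };
}

int main()
{
    const int height_input = 6; // example value for mm_result->dimension(1)
    const int depth_input  = 4; // example value for mm_result->dimension(2)
    const int num_batches  = 2;

    // vector_sum_row must hold height_input * depth_input entries per batch,
    // which is what the reworked validate_arguments() now checks.
    for(int z = 0; z < depth_input * num_batches; ++z)
    {
        for(int y = 0; y < height_input; ++y)
        {
            const RowSumIndex idx = row_sum_index_3d(y, z, height_input, depth_input);
            std::printf("window (y=%d, z=%d) -> batch %d, row sum %d\n", y, z, idx.batch, idx.row);
        }
    }
    return 0;
}

Because the kernel recovers this mapping itself at run time (by comparing mm_result's Y dimension against vector_sum_row's X dimension), callers such as NEGEMMConvolutionLayer only need to pass the depth through GEMMInfo's depth_output_gemm3d rather than forwarding a separate output_3d_depth to the quantize-down output stage.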