diff options
6 files changed, 160 insertions, 63 deletions
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index 58400b190b..d9d3e1a4d8 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -124,5 +124,27 @@ struct InstanceNormalizationLayerKernelInfo float epsilon; /**< Lower bound value for the normalization. Defaults to 1e-12 */ bool use_mixed_precision; /**< Use mixed precision in case of FP16 execution. Defaults to true */ }; + +struct GEMMLowpReductionKernelInfo +{ + /** Default constructor */ + GEMMLowpReductionKernelInfo() = default; + /** Constructor + * + * @param[in] k Number of matrix columns/rows. + * @param[in] is_reshaped True if the input tensor has been reshaped. + * @param[in] scalar Scalar value to multiply each reduced column/row by. + * @param[in] mul_by_scalar True if each column/row reduction has to be multiplied by a scalar value. + */ + GEMMLowpReductionKernelInfo(int32_t k, bool is_reshaped, int32_t scalar, bool mul_by_scalar) + : k(k), is_reshaped(is_reshaped), scalar(scalar), mul_by_scalar(mul_by_scalar) + { + } + + int32_t k{ 0 }; /**< Number of matrix columns/rows */ + bool is_reshaped{ false }; /**< True if the input tensor has been reshaped */ + int32_t scalar{ 0 }; /**< Scalar value to multiply each reduced column/row by */ + bool mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */ +}; } // namespace arm_compute #endif /* ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index fb781aea28..1e472f5252 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,9 @@ namespace arm_compute { +// Forward declarations class ITensor; +struct GEMMLowpReductionKernelInfo; /** Common interface for all NEON reduction kernels */ class INEGEMMLowpReductionKernel : public INEKernel @@ -47,18 +49,23 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] k Number of matrix A columns (or matrix B rows) - * @param[in] is_reshaped True if the input tensor has been reshaped + * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k Number of matrix columns/rows depending on the type of reduction. + * - is_reshaped True if the matrix has been reshaped. + * - scalar Scalar value to multiply each reduced column/row by. + * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - virtual void configure(const ITensor *input, ITensor *output, int32_t k, bool is_reshaped) = 0; + virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0; protected: const ITensor *_input; ITensor *_output; int32_t _k; bool _is_reshaped; + int32_t _scalar; + bool _mul_by_scalar; }; /** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. @@ -75,22 +82,28 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] num_mtx_a_cols Number of matrix A columns - * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_a_cols) Number of matrix A columns + * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced column must be multiplied by a scalar value. */ - void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; + void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] num_mtx_a_cols Number of matrix A columns - * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_a_cols) Number of matrix A columns + * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced column must be multiplied by a scalar value. * * @return a status */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4); + static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -118,22 +131,28 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] num_mtx_b_rows Number of matrix B rows - * @param[in] is_transposed1xW True if the input tensor is transposed 1xW + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_b_rows) Number of matrix B rows. + * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced row must be multiplied by a scalar value. */ - void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; + void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] num_mtx_b_rows Number of matrix B rows - * @param[in] is_transposed1xW True if the input tensor is transposed 1xW + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_b_rows) Number of matrix B rows. + * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced row must be multiplied by a scalar value. * * @return a status */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW); + static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index c87e806d0c..8dc6b88bb0 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -74,7 +74,7 @@ public: * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. * - * @param[in] a First input tensor (Matrix A or Vector A). Data type supported: BLOAT16/F16/F32 + * @param[in] a First input tensor (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32 * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a * @param[out] d Output tensor. Data type supported: same as @p a @@ -86,7 +86,7 @@ public: void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM. * - * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BLOAT16/F16/F32 + * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32 * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a. * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. * @param[out] output Output tensor info. Data type supported: same as @p a diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 5368384b19..e7da1006e0 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -66,9 +66,9 @@ public: * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BLOAT16 + * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16 * @param[out] output Destination tensor. - * Data types supported: Same as @p weights, FP32 if @p weights is BLOAT16 + * Data types supported: Same as @p weights, FP32 if @p weights is BFLOAT16 */ void configure(const ITensor *weights, const ITensor *biases, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights @@ -76,9 +76,9 @@ public: * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BLOAT16 + * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16 * @param[in] output Destination tensor. - * Data types supported: Same as @p weights FP32 if @p weights is BLOAT16 + * Data types supported: Same as @p weights FP32 if @p weights is BFLOAT16 * * @return an error status */ @@ -140,7 +140,7 @@ private: /** Basic function to compute the convolution layer. This function calls the following NEON kernels/functions: * * -# @ref NEIm2ColKernel - * -# @ref NEGEMM (if the data type is BLOAT16/FP16/FP32) + * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32) * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED) * -# @ref NEArithmeticAdditionKernel (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout) diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 374005d897..b7e862c81f 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" @@ -37,26 +38,29 @@ #include <cstddef> #include <cstdint> -using namespace arm_compute; - namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + if(output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + } return Status{}; } std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped) { const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1; + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); @@ -72,9 +76,14 @@ std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + if(output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + } return Status{}; } @@ -82,6 +91,9 @@ std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITens { constexpr unsigned int num_elems_processed_per_iteration = 16; + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32); + // Configure kernel window Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); @@ -98,20 +110,22 @@ std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITens } // namespace INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() - : _input(), _output(), _k(0), _is_reshaped(false) + : _input(), _output(), _k(0), _is_reshaped(false), _scalar(0), _mul_by_scalar(false) { } -void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); - _input = mtx_a; - _output = vector_sum_row; - _k = num_mtx_a_cols; - _is_reshaped = is_interleaved4x4; + _input = mtx_a; + _output = vector_sum_row; + _k = info.k; + _is_reshaped = info.is_reshaped; + _scalar = info.scalar; + _mul_by_scalar = info.mul_by_scalar; // Configure kernel window auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped); @@ -119,11 +133,10 @@ void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor * INEKernel::configure(win_config.second); } -Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) { - ARM_COMPUTE_UNUSED(num_mtx_a_cols); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), info.is_reshaped).first); return Status{}; } @@ -145,11 +158,12 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w Iterator in(_input, win_input); Iterator out(_output, collapsed_window); + const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{}); + if(_is_reshaped) { execute_window_loop(collapsed_window, [&](const Coordinates & id) { - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}); const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); @@ -194,6 +208,12 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w sum_row = wrapper::vaddw(sum_row, a0_d16); } + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_row = wrapper::vmul(sum_row, vec_scalar); + } + auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr()); wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row)); @@ -243,6 +263,12 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_row *= _scalar; + } + *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row); }, in, out); @@ -269,15 +295,17 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf } } -void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); - _input = mtx_b; - _output = vector_sum_col; - _k = num_mtx_b_rows; - _is_reshaped = is_transposed1xW; + _input = mtx_b; + _output = vector_sum_col; + _k = info.k; + _is_reshaped = info.is_reshaped; + _scalar = info.scalar; + _mul_by_scalar = info.mul_by_scalar; // Configure kernel window auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); @@ -285,10 +313,9 @@ void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor * INEKernel::configure(win_config.second); } -Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { - ARM_COMPUTE_UNUSED(num_mtx_b_rows); - ARM_COMPUTE_UNUSED(is_transposed1xW); + ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); @@ -304,6 +331,8 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); + const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{}); + if(_is_reshaped) { Window win_input(collapsed_window); @@ -350,6 +379,15 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); } + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr()); wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); @@ -465,6 +503,15 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const matrix_b += in_b_stride; } + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr()); wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); @@ -495,3 +542,4 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf ARM_COMPUTE_ERROR("Unsupported data type"); } } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 8c6cee78bb..3417c72735 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" @@ -37,7 +38,8 @@ #include "arm_compute/runtime/TensorAllocator.h" #include "support/MemorySupport.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) @@ -172,6 +174,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(!_fused_assembly_path) { + // Build reduction info + const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false); + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0) { @@ -184,7 +189,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -196,7 +201,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false); + _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info); } if(_fuse_output_stage) @@ -418,13 +423,15 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso TensorInfo info_vector_sum_col{}; TensorInfo info_vector_sum_row{}; + const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); + // Validate matrix B reduction kernel only if _a_offset is not equal to 0 if(a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -433,7 +440,7 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); } if(fuse_output_stage) @@ -580,3 +587,4 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() _is_prepared = true; } } +} // namespace arm_compute |