From 48b3ef89de5f21a0169d8416e3d54081f82c7bf8 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 14 Oct 2019 19:03:09 +0100
Subject: COMPMID-2577: Fuse bias addition and activation in gemm assembly
 kernels

Change-Id: I7f52112d2d05b1ea3d3f3d4b19b8eafab05d6c44
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/2141
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Pablo Marquez
---
 arm_compute/runtime/NEON/functions/NEGEMM.h        |  36 +++--
 .../NEON/functions/NEGEMMAssemblyDispatch.h        |  40 ++----
 .../NEON/functions/NEGEMMConvolutionLayer.h        |   6 -
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h  |   4 +
 .../functions/assembly/NEGEMMInterleavedWrapper.h  | 147 ---------------------
 5 files changed, 42 insertions(+), 191 deletions(-)
 delete mode 100644 arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h
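What this change amounts to, mathematically: instead of running the matrix product and then separate bias-addition and activation kernels, the assembly path now computes d = act(alpha * A*B + bias) in a single pass. A minimal scalar reference of that fused epilogue (illustrative only, not the library's assembly code; ReLU stands in for whatever activation the GEMM meta-data carries, and the bias is a length-n vector broadcast across the m rows):

    #include <algorithm>
    #include <cstddef>

    // d = act(alpha * A*B + bias), with A: m x k, B: k x n, bias: n, d: m x n.
    void gemm_bias_act_ref(const float *a, const float *b, const float *bias, float *d,
                           std::size_t m, std::size_t n, std::size_t k, float alpha)
    {
        for(std::size_t i = 0; i < m; ++i)
        {
            for(std::size_t j = 0; j < n; ++j)
            {
                float acc = 0.f;
                for(std::size_t p = 0; p < k; ++p)
                {
                    acc += a[i * k + p] * b[p * n + j];
                }
                acc = alpha * acc + (bias != nullptr ? bias[j] : 0.f); // fused bias addition
                d[i * n + j] = std::max(acc, 0.f);                     // fused ReLU activation
            }
        }
    }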
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index d947be1ef9..e4d69eb93d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_NEGEMM_H__
 #define __ARM_COMPUTE_NEGEMM_H__
 
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
@@ -33,20 +34,27 @@
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include <memory>
-
 namespace arm_compute
 {
 /** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
  *
+ * If optimized assembly is available:
+ *  -# @ref NEGEMMAssemblyDispatch
+ *  -# @ref NEActivationLayer (if alpha != 1.0)
+ * Else:
  *  -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
  *  -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
  *  -# @ref NEGEMMMatrixMultiplyKernel
- *  -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
+ * In both cases:
+ *  -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once)
+ * Else:
+ *  -# @ref NEArithmeticAdditionKernel (if c != nullptr and is reshaped once and not optimized assembly in place)
  *
+ *  -# @ref NEActivationLayer (if activation is specified in GEMMInfo)
  */
 class NEGEMM : public IFunction
 {
@@ -103,13 +111,21 @@ private:
     NEGEMMMatrixMultiplyKernel _mm_kernel;
     NEGEMMAssemblyDispatch     _asm_glue;
     NEGEMMMatrixAdditionKernel _ma_kernel;
-    Tensor                     _tmp_a;
-    Tensor                     _tmp_b;
-    const ITensor             *_original_b;
-    bool                       _run_vector_matrix_multiplication;
-    bool                       _run_addition;
-    bool                       _reshape_b_only_on_first_run;
-    bool                       _is_prepared;
+    NEActivationLayer          _alpha_scale_func;
+    NEArithmeticAdditionKernel _add_bias_kernel;
+    NEActivationLayer          _activation_func;
+
+    Tensor         _tmp_a;
+    Tensor         _tmp_b;
+    Tensor         _tmp_d;
+    const ITensor *_original_b;
+    bool           _run_vector_matrix_multiplication;
+    bool           _run_alpha_scale;
+    bool           _run_addition;
+    bool           _run_bias_addition;
+    bool           _run_activation;
+    bool           _reshape_b_only_on_first_run;
+    bool           _is_prepared;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEGEMM_H__ */
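A note on the new _alpha_scale_func member: the assembly kernels produce A*B unscaled, so a non-unit alpha still needs a separate element-wise pass, which is why the kernel list above mentions NEActivationLayer for alpha != 1.0. An activation layer can serve as a scaler because its LINEAR mode computes y = a*x + b. A sketch of the presumed configuration (the a = alpha, b = 0 choice is an assumption inferred from the kernel list, not quoted from the patch):

    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

    using namespace arm_compute;

    // Scale the GEMM output by alpha in place: y = alpha * x + 0.
    void configure_alpha_scale(NEActivationLayer &alpha_scale, ITensor *gemm_output, float alpha)
    {
        alpha_scale.configure(gemm_output, nullptr, // nullptr output -> run in place
                              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
    }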
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
index 83e495e695..20d189e76b 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
@@ -59,29 +59,10 @@ public:
     };
 
 private:
-    /** ACL Function */
-    std::unique_ptr<IFunction> _function;
-
-    /** If supported create the ACL function corresponding to the GemmMethod provided to process the other passed parameters
-     *
-     * @param[in]  method    GemmMethod to use to perform the matrix multiplication.
-     * @param[in]  a         Input tensor (Matrix A).
-     * @param[in]  b         Input tensor (Matrix B).
-     * @param[in]  c         Input tensor (Matrix C) used to pass the bias for quantized calculations
-     * @param[out] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  alpha     Scalar multiplier to apply to AB matrix product.
-     * @param[in]  beta      Scalar multiplier to apply to input D matrix before adding product.
-     * @param[in]  gemm_info GEMM meta-data
-     *
-     * @return True if the method is supported and the function was successfully created, false otherwise.
-     */
-    bool create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info);
-
     /** Interface for the arm_gemm fallback */
-    std::unique_ptr<IFallback>      _arm_gemm;
-    MemoryGroup                     _memory_group;    /**< Function memory group */
-    std::shared_ptr<IMemoryManager> _memory_manager;  /**< Copy of the memory manager used to create the memory group to be used when instantiating new functions */
-    IWeightsManager                *_weights_manager; /**< Pointer to the weights manager */
+    std::unique_ptr<IFallback> _arm_gemm;
+    MemoryGroup                _memory_group;    /**< Function memory group */
+    IWeightsManager           *_weights_manager; /**< Pointer to the weights manager */
 
 public:
     /** If supported create an ACL function else fallback to the arm_gemm function.
      *
@@ -89,11 +70,9 @@ public:
      * @param[in]  b         Input tensor (Matrix B)
      * @param[in]  c         Input tensor (Matrix C) used to pass the bias for quantized calculations
      * @param[out] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  alpha     Scalar multiplier to apply to AB matrix product.
-     * @param[in]  beta      Scalar multiplier to apply to input D matrix before adding product.
      * @param[in]  gemm_info GEMM meta-data
      */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info);
 
     /** Indicates whether or not this function can be used to process the given parameters.
      *
     * @param[in] a         Input tensor info (Matrix A)
@@ -101,13 +80,18 @@ public:
     * @param[in] b         Input tensor info (Matrix B)
     * @param[in] c         Input tensor info (Matrix C) used to pass the bias for quantized calculations
     * @param[in] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-    * @param[in] alpha     Scalar multiplier to apply to AB matrix product.
-    * @param[in] beta      Scalar multiplier to apply to input D matrix before adding product.
     * @param[in] gemm_info GEMM meta-data
     *
     * @return a status.
     */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info);
+    /** Checks if activation is supported by the gemm assembly dispatcher
+     *
+     * @param[in] activation Activation to check
+     *
+     * @return True if activation is supported else false
+     */
+    static bool is_activation_supported(const ActivationLayerInfo &activation);
     /** Was the function successfully configured ?
      *
      * @return True if the function is configured and ready to run
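The new is_activation_supported() query is what lets callers split the work: activations the assembly back end understands travel inside the GEMM meta-data, anything else stays a separate stage on the output. A sketch of the expected caller-side pattern (function and variable names here are placeholders; only the two NEGEMMAssemblyDispatch entry points come from the header above):

    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"

    using namespace arm_compute;

    void configure_gemm_with_activation(NEGEMMAssemblyDispatch &asm_glue, NEActivationLayer &act_func,
                                        const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
                                        const GEMMInfo &gemm_info, const ActivationLayerInfo &act)
    {
        // The activation rides along in the GEMM meta-data when the dispatcher can fuse it.
        asm_glue.configure(a, b, c, d, gemm_info);

        // Otherwise it is applied in place on the GEMM output as an explicit extra stage.
        if(act.enabled() && !NEGEMMAssemblyDispatch::is_activation_supported(act))
        {
            act_func.configure(d, nullptr, act);
        }
    }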
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index dccc35f0af..3e551abf5a 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
@@ -34,7 +33,6 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
@@ -250,8 +248,6 @@ private:
     NEGEMM                       _mm_gemm;
     NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
     NECol2ImKernel               _col2im_kernel;
-    NEActivationLayer            _activationlayer_function;
-    NEArithmeticAdditionKernel   _add_bias_kernel;
     NEReshapeLayer               _reshape_layer;
 
     const ITensor *_original_weights;
@@ -263,11 +259,9 @@ private:
 
     DataLayout _data_layout;
 
-    bool _append_bias;
     bool _skip_im2col;
     bool _skip_col2im;
     bool _is_quantized;
-    bool _is_activationlayer_enabled;
     bool _is_prepared;
 };
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 5b6a0dd943..12c120934e 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__
 #define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__
 
+#include "NEActivationLayer.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
@@ -46,6 +47,7 @@ class ITensor;
  *  -# @ref NEGEMMTranspose1xWKernel
  *  -# @ref NEGEMMLowpMatrixMultiplyKernel
  *  -# @ref NEGEMMLowpOffsetContributionKernel
+ *  -# @ref NEActivationLayer
  *
  * otherwise if the DOT product instruction is available:
  *
@@ -113,6 +115,7 @@ private:
     NEGEMMLowpMatrixBReductionKernel              _mtx_b_reduction_kernel;
     NEGEMMLowpOffsetContributionKernel            _offset_contribution_kernel;
     NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
+    NEActivationLayer                             _activation_func;
     Tensor                                        _vector_sum_col;
     Tensor                                        _vector_sum_row;
     Tensor                                        _tmp_a;
@@ -127,6 +130,7 @@ private:
     bool                                          _reshape_b_only_on_first_run;
     bool                                          _is_prepared;
     bool                                          _fuse_output_stage;
+    bool                                          _run_activation;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
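For background on the reduction and offset-contribution kernels named in the lowp kernel list above: quantized GEMM needs sum_k (A[i][k] - a_off) * (B[k][j] - b_off), but the kernels multiply the raw 8-bit data and correct afterwards using per-row sums of A and per-column sums of B. Expanding the product gives the identity the contribution kernel applies; a small self-contained check of it (plain C++, not library code):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const int32_t K = 3, a_off = 10, b_off = 128;          // depth and zero points
        const int32_t A[] = { 12, 250, 3 }, B[] = { 7, 130, 200 };
        int32_t lhs = 0, dot = 0, row_sum = 0, col_sum = 0;
        for(int32_t k = 0; k < K; ++k)
        {
            lhs += (A[k] - a_off) * (B[k] - b_off); // the zero-point-corrected product we want
            dot += A[k] * B[k];                     // what the multiply kernel computes
            row_sum += A[k];                        // per-row sum of A (A-reduction kernel)
            col_sum += B[k];                        // per-column sum of B (B-reduction kernel)
        }
        // Offset contribution: dot - b_off*row_sum - a_off*col_sum + K*a_off*b_off.
        assert(lhs == dot - b_off * row_sum - a_off * col_sum + K * a_off * b_off);
        return 0;
    }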
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h
deleted file mode 100644
index 695dcd5b6e..0000000000
--- a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEDWRAPPER_H__
-#define __ARM_COMPUTE_NEGEMMINTERLEAVEDWRAPPER_H__
-
-#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/IScheduler.h"
-#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Buffer manager used when reshaping B on the fly
- *
- * The typical workflow is:
- * - lock_to_reshape_if_needed()
- * - If the previous lock was successful: mark_as_reshaped()
- * - wait_for_reshaping() wait for the reshaping to be complete
- * - mark_as_unused() once the thread is done using this given buffer.
- *
- * Calls for different indices might be interleaved, however the calls for a given index must always be in that order.
- */
-class IBufferManager
-{
-public:
-    /** Lock a buffer for the given index if it's available else return
-     *
-     * @param[in] index Index of the buffer to lock
-     *
-     * @return True if the buffer has been successfully locked, false if it's already reshaped / being reshaped.
-     */
-    virtual bool lock_to_reshape_if_needed(unsigned int index) = 0;
-    /** Mark a buffer previously locked as reshaped
-     *
-     * @pre The thread calling this function must have locked the given buffer through lock_to_reshape_if_needed()
-     *
-     * @param[in] index Index of the buffer to mark as reshaped
-     */
-    virtual void mark_as_reshaped(unsigned int index) = 0;
-    /** Block until the given buffer is marked as reshaped
-     *
-     * @param[in] index Index of the buffer
-     */
-    virtual void wait_for_reshaping(unsigned int index) = 0;
-    /** Mark a reshaped buffer as unused
-     *
-     * Once all the users have marked a buffer as unused then it goes back to being free
-     */
-    virtual void mark_as_unused(unsigned int index) = 0;
-
-    /** Number of buffers used internally
-     *
-     * @return The number of buffers used by the manager.
-     */
-    virtual unsigned int num_buffers() const = 0;
-    /** Default destructor */
-    virtual ~IBufferManager() = default;
-};
-
-/** Equivalent to arm_gemm::GemmInterleaved but using Compute Library types.
- */
-class NEGEMMInterleavedWrapper : public IFunction
-{
-public:
-    NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
-    ~NEGEMMInterleavedWrapper() = default;
-
-    NEGEMMInterleavedWrapper(const NEGEMMInterleavedWrapper &) = delete;
-    NEGEMMInterleavedWrapper &operator=(const NEGEMMInterleavedWrapper &) = delete;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @note The input and output tensor must have the same dimensions
-     *
-     * @param[in]  a         Input tensor (Matrix A)
-     * @param[in]  b         Input tensor (Matrix B)
-     * @param[out] c         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  alpha     Scalar multiplier to apply to AB matrix product.
-     * @param[in]  beta      Scalar multiplier to apply to input C matrix before adding product.
-     * @param[in]  gemm_info GEMM meta-data
-     */
-    void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run() override;
-    void prepare() override;
-
-private:
-    MemoryGroup                                             _memory_group;
-    IWeightsManager                                        *_weights_manager;
-    bool                                                    _is_prepared{ false };
-    bool                                                    _pretranspose_b{ false };
-    Window                                                  _block_walker{};
-    Window                                                  _batch_window{};
-    const ITensor                                          *_a{ nullptr };
-    const ITensor                                          *_b{ nullptr };
-    ITensor                                                *_c{ nullptr };
-    Tensor                                                  _transformed_b{};
-    Tensor                                                  _transformed_a{};
-    Tensor                                                  _tmp_c{};
-    INEGEMMWrapperKernel::Params                            _params{};
-    BlockSizes                                              _block_sizes{};
-    std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> _prepare_b{ nullptr };
-    std::unique_ptr<NEGEMMInterleavedTransformAWrapper>     _transform_a{ nullptr };
-    std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> _matrix_multiply{ nullptr };
-    std::unique_ptr<IBufferManager>                         _buffer_manager{ nullptr };
-    std::vector<TransformAWorkload>                         _a_workloads{};
-    std::vector<PrepareBWorkload>                           _b_workloads{};
-    std::vector<MatrixMultiplyWorkload>                     _mm_workloads{};
-    std::vector<IScheduler::Workload>                       _workloads{};
-    std::string                                             _tag{};
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDWRAPPER_H__ */
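The IBufferManager contract removed above is worth a note, since the call-order rules in its doc comment are easy to misread: lock_to_reshape_if_needed() elects one thread to reshape a block of B, every consumer synchronises on wait_for_reshaping(), and the buffer is recycled only once all users have called mark_as_unused(). A sketch of how a worker thread was expected to drive the (now deleted) interface; the reshape/consume bodies are placeholders, only the sequencing comes from the comments above:

    void worker(IBufferManager &mgr, unsigned int index)
    {
        // At most one thread wins the lock and performs the reshape for this index.
        if(mgr.lock_to_reshape_if_needed(index))
        {
            // ... reshape block 'index' of B into the managed buffer ...
            mgr.mark_as_reshaped(index); // publish the reshaped data
        }
        mgr.wait_for_reshaping(index);   // block until the buffer is ready
        // ... consume the reshaped buffer ...
        mgr.mark_as_unused(index);       // buffer returns to the pool once all users release it
    }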