From eaefd002a5d6509dd5f12e98b538c99b33c2c1ee Mon Sep 17 00:00:00 2001
From: Anthony Barbier
Date: Fri, 20 Jul 2018 17:49:35 +0100
Subject: COMPMID-1419: Make NEGEMMAssemblyDispatch dynamically typed instead of templated

This makes it easier to integrate in GEMMLowpMatrixMultiplyCore

Change-Id: Ibf80803f016a2e6a24d943ffafb50b48f04ec545
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140868
Reviewed-by: Georgios Pinitas
Tested-by: Jenkins
---
 arm_compute/runtime/NEON/functions/NEGEMM.h        |  2 +-
 .../NEON/functions/NEGEMMAssemblyDispatch.h        | 89 ++++++++----------
 .../NEON/functions/NEGEMMConvolutionLayer.h        |  2 +-
 .../NEGEMMLowpAssemblyMatrixMultiplyCore.h         | 15 ++--
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h  |  3 +-
 .../NEON/functions/NEWinogradConvolutionLayer.h    |  4 +-
 6 files changed, 42 insertions(+), 73 deletions(-)

(limited to 'arm_compute/runtime/NEON')

diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 523f1d33a1..36c9587969 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -85,7 +85,7 @@ private:
     NEGEMMInterleave4x4Kernel  _interleave_kernel;
     NEGEMMTranspose1xWKernel   _transpose_kernel;
     NEGEMMMatrixMultiplyKernel _mm_kernel;
-    NEGEMMAssemblyDispatchF32  _asm_glue;
+    NEGEMMAssemblyDispatch     _asm_glue;
     NEGEMMMatrixAdditionKernel _ma_kernel;
     Tensor                     _tmp_a;
     Tensor                     _tmp_b;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
index 1c9ecb088e..382ef1caba 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
@@ -35,7 +35,6 @@
 namespace arm_compute
 {
 /** Assembly kernel glue */
-template <typename TypeInput, typename TypeOutput>
 class NEGEMMAssemblyDispatch : public IFunction
 {
 public:
@@ -43,12 +42,21 @@ public:
     NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);

     /** Prevent instances of this class from being copy constructed */
-    NEGEMMAssemblyDispatch(const NEGEMMAssemblyDispatch<TypeInput, TypeOutput> &) = delete;
+    NEGEMMAssemblyDispatch(const NEGEMMAssemblyDispatch &) = delete;
     /** Prevent instances of this class from being copied */
-    NEGEMMAssemblyDispatch<TypeInput, TypeOutput> &operator=(const NEGEMMAssemblyDispatch<TypeInput, TypeOutput> &) = delete;
-    NEGEMMAssemblyDispatch(NEGEMMAssemblyDispatch<TypeInput, TypeOutput> &&) = default;
-    NEGEMMAssemblyDispatch<TypeInput, TypeOutput> &operator=(NEGEMMAssemblyDispatch<TypeInput, TypeOutput> &&) = default;
-    ~NEGEMMAssemblyDispatch() = default;
+    NEGEMMAssemblyDispatch &operator=(const NEGEMMAssemblyDispatch &) = delete;
+    NEGEMMAssemblyDispatch(NEGEMMAssemblyDispatch &&) = default;
+    NEGEMMAssemblyDispatch &operator=(NEGEMMAssemblyDispatch &&) = default;
+    ~NEGEMMAssemblyDispatch() = default;
+
+    class IFallback
+    {
+    public:
+        virtual void run()                 = 0;
+        virtual void prepare()             = 0;
+        virtual bool is_configured() const = 0;
+        virtual ~IFallback()               = default;
+    };

 private:
     /** ACL Function */
@@ -68,53 +76,9 @@ private:
      */
    bool create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint);

-    //Fallback: use arm_gemm's AssemblyGemm:
-    class Fallback
-    {
-#ifndef DOXYGEN_SKIP_THIS
-    public:
-        /** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel.
-         *  The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2)
-         */
-        void run();
-        void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group);
-        void prepare();
-        bool is_configured() const;
-#endif /* DOXYGEN_SKIP_THIS */
-
-    private:
-        /** Allocate a workspace tensor.
-         *
-         * @param[in] workspace_size Size to allocate.
-         * @param[in] memory_group   Tensor memory group.
-         * @param[in] alignment      Workspace memory alignment.
-         */
-        void allocate_workspace(size_t workspace_size, MemoryGroup *memory_group, size_t alignment);
-
-        /** Assembly Gemm kernel */
-        std::unique_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
-        /** Optimised NEON kernel */
-        std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
-        /** Input A */
-        const ITensor *_a
-        {
-            nullptr
-        };
-        /** Input B */
-        const ITensor *_b
-        {
-            nullptr
-        };
-        /** Output */
-        ITensor *_d{ nullptr };
-        /** GEMM workspace */
-        Tensor _workspace{};
-        /** Pre-transpose tensor */
-        Tensor _pretranspose{};
-        /** Prepared flag */
-        bool _is_prepared{ false };
-    } _arm_gemm;               /**< Fallback in case ACL doesn't have a function */
-    MemoryGroup _memory_group; /**< Function memory group */
+    /** Interface for the arm_gemm fallback */
+    std::unique_ptr<IFallback> _arm_gemm;
+    MemoryGroup                _memory_group; /**< Function memory group */
 public:
     /** If supported create an ACL function else fallback to the arm_gemm function.
      *
      * @param[in] a                 Input tensor (Matrix A)
      * @param[in] b                 Input tensor (Matrix B)
      * @param[in] d                 Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
      * @param[in] alpha             Scalar multiplier to apply to AB matrix product.
      * @param[in] beta              Scalar multiplier to apply to input D matrix before adding product.
      * @param[in] pretranspose_hint Can the B tensor can be pretransposed (ie shared across invocations)?
      */
     void configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint);
+
+    /** Indicates whether or not this function can be used to process the given parameters.
+     *
+     * @param[in] a                 Input tensor (Matrix A)
+     * @param[in] b                 Input tensor (Matrix B)
+     * @param[in] d                 Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in] alpha             Scalar multiplier to apply to AB matrix product.
+     * @param[in] beta              Scalar multiplier to apply to input D matrix before adding product.
+     * @param[in] pretranspose_hint Can the B tensor can be pretransposed (ie shared across invocations)?
+     *
+     * @return a status.
+     */
+    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, float alpha, float beta, bool pretranspose_hint);
     /** Was the function successfully configured ?
      *
      * @return True if the function is configured and ready to run
      */
     bool is_configured() const;
     // Inherited methods overridden:
     void run() override;
 };

-/** Float 32 assembly dispatch kernel */
-using NEGEMMAssemblyDispatchF32 = NEGEMMAssemblyDispatch<float, float>;
-/** Uint 8 to Uint 32 assembly dispatch kernel */
-using NEGEMMAssemblyDispatchU8U32 = NEGEMMAssemblyDispatch<uint8_t, uint32_t>;
-/** Int 8 to Int 32 assembly dispatch kernel */
-using NEGEMMAssemblyDispatchS8S32 = NEGEMMAssemblyDispatch<int8_t, int32_t>;
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 1564b6c983..8f41462b0b 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -169,7 +169,7 @@ private:

 private:
     MemoryGroup                      _memory_group;
-    NEGEMMAssemblyDispatchF32        _asm_glue;
+    NEGEMMAssemblyDispatch           _asm_glue;
     NEIm2ColKernel                   _input_im2col_kernel;
     NEGEMMInterleave4x4Kernel        _input_interleave_kernel;
     NEConvolutionLayerReshapeWeights _reshape_weights;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
index b6672d7584..27be34d1f8 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
@@ -58,14 +58,13 @@ public:
     void run() override;

 private:
-    MemoryGroup                 _memory_group;
-    NEGEMMAssemblyDispatchU8U32 _asm_glue_unsigned;
-    NEGEMMAssemblyDispatchS8S32 _asm_glue_signed;
-    std::unique_ptr<INEKernel>  _mm_kernel;
-    std::unique_ptr<INEKernel>  _mtx_a_reshape_kernel;
-    std::unique_ptr<INEKernel>  _mtx_b_reshape_kernel;
-    Tensor                      _tmp_a;
-    Tensor                      _tmp_b;
+    MemoryGroup                _memory_group;
+    NEGEMMAssemblyDispatch     _asm_glue;
+    std::unique_ptr<INEKernel> _mm_kernel;
+    std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
+    std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
+    Tensor                     _tmp_a;
+    Tensor                     _tmp_b;
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 96ac7bb7e0..3db76f423c 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -98,8 +98,7 @@ public:

 private:
     MemoryGroup                _memory_group;
-    NEGEMMAssemblyDispatchU8U32 _asm_glue_unsigned;
-    NEGEMMAssemblyDispatchS8S32 _asm_glue_signed;
+    NEGEMMAssemblyDispatch     _asm_glue;
     std::unique_ptr<INEKernel> _mm_kernel;
     std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
     std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 384fbf893b..5da63311e0 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -43,7 +43,7 @@ class ITensor;
  * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method )
  * -# @ref NEWinogradLayerTransformInputKernel
  * -# @ref NEWinogradLayerTransformOutputKernel
- * -# @ref NEGEMMAssemblyDispatchF32
+ * -# @ref NEGEMMAssemblyDispatch
  * -# @ref CPPPermute (three times: weights, input and output)
  *
  * @note Some Winograd configurations (i.e. F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true
@@ -103,7 +103,7 @@ public:

 private:
     MemoryGroup                _memory_group;
-    NEGEMMAssemblyDispatchF32  _asm_glue;
+    NEGEMMAssemblyDispatch     _asm_glue;
     std::unique_ptr<INEKernel> _transform_input_kernel;
     std::unique_ptr<INEKernel> _transform_output_kernel;
     std::unique_ptr<INEKernel> _transform_weights_kernel;
--
cgit v1.2.1
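
Not part of the patch, but for readers unfamiliar with the reworked interface: the sketch below shows how the now non-templated NEGEMMAssemblyDispatch would be driven, using only the configure()/validate()/is_configured()/run() declarations added in the header above. The tensor shapes, the F32 data type and the alpha/beta values are made up for illustration, and error handling is reduced to the validate() check.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Hypothetical problem size: D(M=32, N=128) = 1.0f * A(M=32, K=64) * B(K=64, N=128)
    Tensor a{}, b{}, d{};
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));  // A: K x M
    b.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)); // B: N x K
    d.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32)); // D: N x M

    NEGEMMAssemblyDispatch asm_gemm; // no template arguments any more: the type is picked up from the tensors

    // Ask the dispatcher whether it supports these parameters before configuring it
    const Status status = NEGEMMAssemblyDispatch::validate(a.info(), b.info(), d.info(), 1.f, 0.f, true);
    if(status.error_code() == ErrorCode::OK)
    {
        asm_gemm.configure(&a, &b, &d, 1.f /* alpha */, 0.f /* beta */, true /* pretranspose_hint */);
    }

    // Allocate backing memory (and fill the tensors) before running
    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    if(asm_gemm.is_configured())
    {
        asm_gemm.run();
    }
    return 0;
}

With the previous templated design the caller had to pick NEGEMMAssemblyDispatchF32, NEGEMMAssemblyDispatchU8U32 or NEGEMMAssemblyDispatchS8S32 at compile time; after this change a single member can serve all data types, which is what makes the NEGEMMLowpMatrixMultiplyCore simplification in this patch possible.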