From 72219330fd85b1271e714d4ba894d6d8e26340c9 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 5 Jun 2018 14:56:06 +0100
Subject: COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES)

Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255
Reviewed-by: Anthony Barbier
Tested-by: Jenkins
Reviewed-by: Pablo Tello
Reviewed-by: Michele DiGiorgio
---
 arm_compute/runtime/NEON/AssemblyHelper.h | 52 +++++++++++++++-------
 .../runtime/NEON/functions/NEConvolutionLayer.h | 1 +
 .../runtime/NEON/functions/NEDeconvolutionLayer.h | 2 +
 .../NEON/functions/NEDepthwiseConvolutionLayer.h | 3 +-
 .../NEDepthwiseSeparableConvolutionLayer.h | 3 +-
 .../runtime/NEON/functions/NEFullyConnectedLayer.h | 7 +--
 arm_compute/runtime/NEON/functions/NEGEMM.h | 15 +++++--
 .../NEON/functions/NEGEMMConvolutionLayer.h | 2 +
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 12 ++++-
 .../NEON/functions/NELocallyConnectedLayer.h | 3 +-
 .../NEON/functions/NEWinogradConvolutionLayer.h | 3 +-
 11 files changed, 77 insertions(+), 26 deletions(-)

(limited to 'arm_compute/runtime/NEON')

diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index 3aa43ec96e..c4ba1a584e 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -51,7 +51,7 @@ public:
     using TypeResult = TypeOutput;
     /** Default constructor. */
     AssemblyKernelGlue()
-        : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr)
+        : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr), _is_prepared(false)
     {
     }
     /** Assembly Gemm */
@@ -76,6 +76,31 @@ public:
     ITensor *_workspace;
     /** Pre-transpose tensor */
     ITensor *_pretranspose;
+    /** Prepared flag */
+    bool _is_prepared;
+
+    /** Runs a preparation step, usually for pre-transposing matrix b */
+    void prepare()
+    {
+        // Pretranspose B if required
+        if(_gemm_kernel_asm->B_pretranspose_required())
+        {
+            const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+            const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+            const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+            // Forcing 128-byte alignment (required by 32-bit kernels)
+            const unsigned int alignment = 128;
+            void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
+            size_t space = _pretranspose->info()->total_size();
+            void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
+            ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
+            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
+            _b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }

     /** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel.
      *  The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2)
@@ -102,28 +127,25 @@ public:
         const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
         auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());

-        // Set workspace if needed
+        // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
         if(_workspace != nullptr)
         {
             _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace->buffer()));
+            const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+            unsigned int num_threads = NEScheduler::get().num_threads();
+            if(window_size < num_threads)
+            {
+                num_threads = window_size;
+                _gemm_kernel_asm->set_nthreads(num_threads);
+            }
         }

+        // Prepare assembly kernel
+        prepare();
+
         // Set gemm parameters
         _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);

-        // Pretranspose B if required
-        if(_gemm_kernel_asm->B_pretranspose_required())
-        {
-            // Forcing 128-byte alignment (required by 32-bit kernels)
-            const unsigned int alignment = 128;
-            void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
-            size_t space = _pretranspose->info()->total_size();
-            void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
-            ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
-            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
-            _b->mark_as_unused();
-        }
-
         // Schedule assembly kernel
         NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
     }
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index ff41f0c985..e143814a4e 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -112,6 +112,7 @@ public:
                            const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     std::shared_ptr<IMemoryManager> _memory_manager;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 66c6d427ba..3e527168c1 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -108,6 +108,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -117,6 +118,7 @@ private:
     ITensor *_input;
     PadStrideInfo _info;
     std::pair<unsigned int, unsigned int> _inner_border;
+    bool _is_prepared;
 };
 } // arm_compute
 #endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index b80fb7f2c8..aa4cace7c2 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -122,6 +122,7 @@ public:

     // Inherited methods overriden:
     void run() override;
+    void prepare() override;

 private:
     NEDepthwiseIm2ColKernel _im2col_kernel;
@@ -135,7 +136,7 @@ private:
     Tensor _weights_reshaped;
     Tensor _v2mm_output;
     Tensor _output_reshaped;
-    bool _is_first_run;
+    bool _is_prepared;
     bool _is_quantized;
     const ITensor *_original_weights;
 };
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
index 0562c07515..99e93ccece 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,7 @@ public:

     // Inherited methods overriden:
     void run() override;
+    void prepare() override;

 private:
     NEDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 071eecc3f7..2739f5ebef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -127,22 +127,23 @@ public:

     //Inherited methods override
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
     NEIm2ColKernel _im2col_kernel;
-    NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
     NEGEMMInterleave4x4Kernel _interleave4x4_kernel;
     NEGEMMMatrixMultiplyKernel _mm_kernel;
     NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
     Tensor _im2col_output;
     Tensor _interleave4x4_output;
     Tensor _reshape_weights_output;
-    bool _are_weights_reshaped;
+    const ITensor *_original_weights;
     bool _is_batched_fc_layer;
     bool _linearize_input;
     bool _accumulate_biases;
-    const ITensor *_original_weights;
+    bool _is_prepared;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index e2263c2307..5d108b2c14 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -53,7 +53,14 @@ class NEGEMM : public IFunction
 public:
     /** Constructor */
     NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMM(const NEGEMM &) = delete;
+    /** Default move constructor */
+    NEGEMM(NEGEMM &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMM &operator=(const NEGEMM &) = delete;
+    /** Default move assignment operator */
+    NEGEMM &operator=(NEGEMM &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -72,6 +79,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -84,10 +92,11 @@ private:
     Tensor _tmp_b;
     Tensor _workspace;
     Tensor _B_pretransposed;
+    const ITensor *_original_b;
     bool _run_vector_matrix_multiplication;
     bool _run_addition;
-    bool _is_first_run;
     bool _reshape_b_only_on_first_run;
+    bool _is_prepared;
 };
-}
+} // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index d64fd9e771..7075becf75 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     /** Configures the appropriate matrix multiply routine
@@ -197,6 +198,7 @@ private:
     bool _is_interleaved;
     bool _is_activationlayer_enabled;
     bool _skip_im2col;
+    bool _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index adcddb8263..f32eb3c757 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -56,6 +56,14 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
 public:
     /** Constructor */
     NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move constructor */
+    NEGEMMLowpMatrixMultiplyCore(NEGEMMLowpMatrixMultiplyCore &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move assignment operator */
+    NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM_LOWP: low precision GEMM kernel
@@ -86,6 +94,7 @@ public:

     // Inherited methods overridden
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -103,12 +112,13 @@ private:
     Tensor _tmp_b;
     Tensor _workspace;
     Tensor _B_pretranspose;
+    const ITensor *_original_b;
     int32_t _a_offset;
     int32_t _b_offset;
     bool _run_vector_matrix_multiplication;
     bool _dot_product_path;
-    bool _is_first_run;
     bool _reshape_b_only_on_first_run;
+    bool _is_prepared;
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index 18cd27414e..7d1f124bb3 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

 private:
     MemoryGroup _memory_group;
@@ -100,7 +101,7 @@ private:
     Tensor _input_im2col_reshaped;
     Tensor _weights_reshaped;
     Tensor _gemm_output;
-    bool _is_first_run;
+    bool _is_prepared;
     const ITensor *_original_weights;
 };
 }
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 55921f78f3..c1260977c0 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -74,6 +74,7 @@ public:

     // Inherited methods overridden:
     void run() override;
+    void prepare() override;

     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
      *
@@ -122,7 +123,7 @@ private:
     const ITensor *_input;
     const ITensor *_weights;
     ITensor *_output;
-    bool _reshaped_kernel;
+    bool _is_prepared;
     bool _is_activationlayer_enabled;
 };
 }
--
cgit v1.2.1
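
Note (editorial, not part of the patch): the headers above only declare the new prepare() stage. The intent, as the AssemblyKernelGlue changes show, is that one-off work such as reshaping weights or pretransposing B is separated from the per-run work, so it can be triggered ahead of time and the original weights released afterwards. A minimal caller-side sketch, assuming the existing NEGEMM configure()/run() flow; the tensor shapes and the explicit prepare() call are illustrative only:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, d;
    // A: M=32 x K=64, B: K=64 x N=16, D: M=32 x N=16 (TensorShape is given as (x, y) = (cols, rows))
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

    NEGEMM gemm;
    gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a and b with data ...

    gemm.prepare(); // one-off stage, e.g. pretransposing B; the original B can then be marked unused
    gemm.run();     // first and subsequent runs reuse the prepared data
    return 0;
}

If prepare() is not called explicitly, run() is expected to trigger it on first execution, in the same way AssemblyKernelGlue::run() calls prepare() in the hunk above.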
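
The matching .cpp changes are not part of this excerpt (it is limited to 'arm_compute/runtime/NEON'), so the following is only a hypothetical sketch of the prepare()/run() split suggested by the NEGEMM header members (_is_prepared, _original_b, _reshape_b_only_on_first_run) and by AssemblyKernelGlue::prepare(); it is not the implementation from this commit:

// Hypothetical sketch only; member names are taken from the NEGEMM header in this patch.
void NEGEMM::prepare()
{
    if(!_is_prepared)
    {
        if(_reshape_b_only_on_first_run)
        {
            // One-off work: reshape/pretranspose B into the persistent buffers
            // (_tmp_b / _B_pretransposed), after which the original B tensor is
            // no longer needed at run time.
            _original_b->mark_as_unused();
        }
        _is_prepared = true;
    }
}

void NEGEMM::run()
{
    prepare(); // ensure the one-off stage has executed before the first run

    _memory_group.acquire();
    // ... schedule the interleave/transpose, matrix multiply and addition kernels ...
    _memory_group.release();
}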