From 37d080f2f11cfd734104b76512e1fb191486216e Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 21 Jun 2019 18:43:12 +0100 Subject: COMPMID-2378: Sanitize GEMM configuration for NEON Change-Id: I7859b82b2059e14685f8792424648ac5eacd67f1 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/1418 Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Michalis Spyrou Tested-by: Arm Jenkins --- arm_compute/core/Dimensions.h | 23 +++++++- .../NEON/kernels/assembly/INEGEMMWrapperKernel.h | 18 +++--- .../NEGEMMInterleavedMatrixMultiplyWrapper.h | 55 +++++++++++-------- .../assembly/NEGEMMInterleavedTransformAWrapper.h | 38 +++++++------ arm_compute/core/Types.h | 64 ++++++++++++++++------ arm_compute/core/WindowIterator.h | 11 +++- 6 files changed, 142 insertions(+), 67 deletions(-) (limited to 'arm_compute/core') diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h index 0a9264f6b0..9c38c60779 100644 --- a/arm_compute/core/Dimensions.h +++ b/arm_compute/core/Dimensions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -166,6 +166,27 @@ public: collapse(num_dimensions() - start, start); } + /** Remove dimension of a given index + * + * @note If index is greater than the number of dimensions no operation is performed + * + * @param[in] idx Dimension index to remove + */ + void remove(size_t idx) + { + ARM_COMPUTE_ERROR_ON(_num_dimensions < 1); + if(idx >= _num_dimensions) + { + return; + } + + std::copy(_id.begin() + idx + 1, _id.end(), _id.begin() + idx); + _num_dimensions--; + + // Make sure all empty dimensions are filled with 0 + std::fill(_id.begin() + _num_dimensions, _id.end(), 0); + } + /** Returns a read/write iterator that points to the first element in the dimension array. * * @return an iterator. diff --git a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h index 63178a738a..352f73d7f1 100644 --- a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h +++ b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,7 +45,7 @@ public: unsigned int multis{ 0 }; /**< Number of "multi" GEMMs (unique A, B and C). */ }; - static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c); + static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info); /** Constructor */ INEGEMMWrapperKernel(); @@ -61,13 +61,14 @@ public: * * @note The input and output tensor must have the same dimensions * - * @param[in] a Input tensor (Matrix A) - * @param[in] b Input tensor (Matrix B) - * @param[out] c Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. - * @param[in] alpha Scalar multiplier to apply to AB matrix product. - * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. + * @param[in] a Input tensor (Matrix A) + * @param[in] b Input tensor (Matrix B) + * @param[out] c Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] alpha Scalar multiplier to apply to AB matrix product. + * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. + * @param[in] gemm_info GEMM meta-data */ - void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta); + void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -95,6 +96,7 @@ protected: const ITensor *_b; ITensor *_c; Params _params; + GEMMInfo _gemm_info; private: Window _window3d; diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h index e2b849aa3d..40b6f5da39 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h @@ -95,31 +95,32 @@ class NEGEMMInterleavedMatrixMultiplyWrapperTemplate : public NEGEMMInterleavedM public: /** Configure the matrix multiplication: C = alpha * A * B + beta * C * - * @param[in] prepared_a Already reshaped matrix A. - * @param[in] transformed_b Already reshaped matrix B. - * @param[out] tmp_c Temporary buffer to be used to store intermediate results. - * @param[in,out] c Result matrix C. - * @param[in] block_walker Window containing iteration information for the M and batch dimensions. - * @param[in] block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes). - * @param[in] params M, N, K sizes. - * @param[in] is_pretransposed Is B also pretransposed ? - * @param[in] alpha Alpha value - * @param[in] beta Beta value - * @param[in] max_num_threads Maximum number of threads that might be used for the calculations. + * @param[in] prepared_a Already reshaped matrix A. + * @param[in] transformed_b Already reshaped matrix B. + * @param[out] tmp_c Temporary buffer to be used to store intermediate results. + * @param[in,out] c Result matrix C. + * @param[in] block_walker Window containing iteration information for the M and batch dimensions. + * @param[in] block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes). + * @param[in] params M, N, K sizes. + * @param[in] gemm_info GEMM meta-data + * @param[in] alpha Alpha value + * @param[in] beta Beta value + * @param[in] max_num_threads Maximum number of threads that might be used for the calculations. */ void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, const BlockSizes &block_sizes, - const INEGEMMWrapperKernel::Params ¶ms, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads) + const INEGEMMWrapperKernel::Params ¶ms, const GEMMInfo &gemm_info, float alpha, float beta, unsigned int max_num_threads) { - _prepared_a = prepared_a; - _transformed_b = transformed_b; - _tmp_c = tmp_c; - _c = c; - _block_walker = block_walker; - _block_sizes = block_sizes; - _params = params; - _b_is_pretransposed = b_is_pretransposed; - _alpha = alpha; - _beta = beta; + _prepared_a = prepared_a; + _transformed_b = transformed_b; + _tmp_c = tmp_c; + _c = c; + _block_walker = block_walker; + _block_sizes = block_sizes; + _params = params; + _b_is_pretransposed = gemm_info.pretranpose_B(); + _reinterpret_c_as_3d = gemm_info.depth_output_gemm3d() != 0; + _alpha = alpha; + _beta = beta; auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads })); } @@ -133,6 +134,14 @@ public: TensorAccessor c(*_c); TensorAccessor tmp_c(*_tmp_c); + // Handle 3d output re-interpretation + if(_reinterpret_c_as_3d) + { + Strides c_strides_as_3d = _c->info()->strides_in_bytes(); + c_strides_as_3d.remove(Window::DimZ); + c.set_strides(c_strides_as_3d); + } + int prev_batch = -1; typename strategy::operand_type *a_ptr = nullptr; auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id) @@ -216,9 +225,9 @@ private: INEGEMMWrapperKernel::Params _params{}; Window _block_walker{}; bool _b_is_pretransposed{ false }; + bool _reinterpret_c_as_3d{ false }; typename strategy::result_type _alpha{}; typename strategy::result_type _beta{}; }; - } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDMATRIXMULTIPLYWRAPPER_H__ */ diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h index 5d6cd02398..b18d327339 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h @@ -87,20 +87,22 @@ class NEGEMMInterleavedTransformAWrapperTemplate : public NEGEMMInterleavedTrans public: /** Configure the reshape A routine. * - * @param[in] a Input matrix A. - * @param[out] transformed_a Reshaped matrix A. - * @param[in] transpose_a Also transpose A ? - * @param[in] block_walker Window representing the layout of the matrix's blocks - * @param[in] params M, N, K sizes. + * @param[in] a Input matrix A. + * @param[out] transformed_a Reshaped matrix A. + * @param[in] transpose_a Also transpose A ? + * @param[in] reinterpret_a_as_3d Re-interpret as 3D ? + * @param[in] block_walker Window representing the layout of the matrix's blocks + * @param[in] params M, N, K sizes. */ - void configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker, const INEGEMMWrapperKernel::Params ¶ms) + void configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, bool reinterpret_a_as_3d, const Window &block_walker, const INEGEMMWrapperKernel::Params ¶ms) { - _a = a; - _transformed_a = transformed_a; - _transpose_a = transpose_a; - _Ksize = params.K; - _Msize = params.M; - _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension + _a = a; + _transformed_a = transformed_a; + _transpose_a = transpose_a; + _reinterpret_a_as_3d = reinterpret_a_as_3d; + _Ksize = params.K; + _Msize = params.M; + _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension } // Inherited methods overridden: @@ -110,12 +112,12 @@ public: TensorAccessor a(*_a); TensorAccessor transformed_a(*_transformed_a); - if(_a->info()->data_layout() == DataLayout::NHWC) + // Handle 3d input re-interpretation + if(_reinterpret_a_as_3d) { - // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is - // the relevant multiple of the row stride. - const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize; - a.set_stride(2, nhwc_batch_stride); + Strides a_strides_as_3d = _a->info()->strides_in_bytes(); + a_strides_as_3d.remove(Window::DimZ); + a.set_strides(a_strides_as_3d); } unsigned int last_m = 0; @@ -164,8 +166,8 @@ private: unsigned int _Msize{ 0 }; unsigned int _Ksize{ 0 }; bool _transpose_a{ false }; + bool _reinterpret_a_as_3d{ false }; Window _k_multi_window{}; }; - } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDTRANSFORMAWRAPPER_H__ */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index ad679d6786..b4d94eced4 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1765,9 +1765,17 @@ class GEMMInfo { public: /** Default constructor */ - GEMMInfo() - : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(true), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false), _retain_internal_weights(false), _gemmlowp_output_stage(), - _fp_mixed_precision(false), _broadcast_bias(false) + GEMMInfo() noexcept + : _is_a_reshaped(false), + _is_b_reshaped(false), + _reshape_b_only_on_first_run(true), + _depth_output_gemm3d(0), + _reinterpret_input_as_3d(false), + _retain_internal_weights(false), + _gemmlowp_output_stage(), + _fp_mixed_precision(false), + _broadcast_bias(false), + _pretranpose_B(true) { } /** Constructor @@ -1785,10 +1793,17 @@ public: * @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix. */ GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false, - GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false) - : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _depth_output_gemm3d(depth_output_gemm3d), - _reinterpret_input_as_3d(reinterpret_input_as_3d), _retain_internal_weights(retain_internal_weights), _gemmlowp_output_stage(gemmlowp_output_stage), _fp_mixed_precision(fp_mixed_precision), - _broadcast_bias(broadcast_bias) + GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false) noexcept + : _is_a_reshaped(is_a_reshaped), + _is_b_reshaped(is_b_reshaped), + _reshape_b_only_on_first_run(reshape_b_only_on_first_run), + _depth_output_gemm3d(depth_output_gemm3d), + _reinterpret_input_as_3d(reinterpret_input_as_3d), + _retain_internal_weights(retain_internal_weights), + _gemmlowp_output_stage(gemmlowp_output_stage), + _fp_mixed_precision(fp_mixed_precision), + _broadcast_bias(broadcast_bias), + _pretranpose_B(reshape_b_only_on_first_run) { } /** Flag which specifies if the matrix A has been reshaped @@ -1865,17 +1880,34 @@ public: { return _broadcast_bias; }; + /** Flag which specifies whether b should be pre-transposed if supported. + * + * @return True if b should be pre-transposed else false. + */ + bool pretranpose_B() const + { + return _pretranpose_B; + }; + /** Set pre-transpose b flag + * + * @param[in] flag Flag to set + */ + void set_pretranpose_B(bool flag) + { + _pretranpose_B = flag; + } private: - const bool _is_a_reshaped; - const bool _is_b_reshaped; - const bool _reshape_b_only_on_first_run; - const int _depth_output_gemm3d; - const bool _reinterpret_input_as_3d; - const bool _retain_internal_weights; - const GEMMLowpOutputStageInfo _gemmlowp_output_stage; - const bool _fp_mixed_precision; - const bool _broadcast_bias; + bool _is_a_reshaped; + bool _is_b_reshaped; + bool _reshape_b_only_on_first_run; + int _depth_output_gemm3d; + bool _reinterpret_input_as_3d; + bool _retain_internal_weights; + GEMMLowpOutputStageInfo _gemmlowp_output_stage; + bool _fp_mixed_precision; + bool _broadcast_bias; + bool _pretranpose_B; }; /** Winograd information */ diff --git a/arm_compute/core/WindowIterator.h b/arm_compute/core/WindowIterator.h index 32d6293a5a..15289b6d69 100644 --- a/arm_compute/core/WindowIterator.h +++ b/arm_compute/core/WindowIterator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -86,6 +86,15 @@ public: _strides[dim] = size; } + /** Manually set the strides + * + * @param[in] strides Strides to set + */ + void set_strides(const Strides &strides) + { + _strides = strides; + } + /** Returns a pointer to the element at coordinates (x,y,z,w) * * @param[in] x X coordinates -- cgit v1.2.1