From d7316eb877cc4ff8573219374335e917b19a0203 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 16 Jun 2021 11:14:41 +0100 Subject: Port NEGEMMConv2d to memory injecting interface Resolves: COMPMID-4506, COMPMID-4570 Change-Id: I6d37a06da141f1fcfcaa8525322a319cb0234791 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5824 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/runtime/NEON/functions/NEGEMM.h | 15 +- arm_compute/runtime/NEON/functions/NEGEMMConv2d.h | 5 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 63 +--- .../NEON/functions/NEWinogradConvolutionLayer.h | 15 +- src/core/helpers/MemoryHelpers.h | 2 +- src/runtime/CL/functions/CLGEMM.cpp | 14 +- src/runtime/NEON/functions/NEGEMM.cpp | 53 +++- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 74 ++++- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 350 ++++++++++++--------- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 5 - src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp | 77 +++-- src/runtime/cpu/operators/CpuGemmDirectConv2d.h | 40 ++- .../operators/internal/CpuGemmAssemblyDispatch.cpp | 265 +++++----------- .../operators/internal/CpuGemmAssemblyDispatch.h | 20 +- tests/validation/NEON/ConvolutionLayer.cpp | 95 ++++++ 15 files changed, 560 insertions(+), 533 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 6fa30bd545..6c5be0eb5e 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -32,6 +32,7 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/helpers/MemoryHelpers.h" #include @@ -105,14 +106,7 @@ public: void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM. * - * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32 - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. - * @param[out] output Output tensor info. 
Data type supported: same as @p a - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of matrix C - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run + * Similar to @ref NEGEMM::configure() * * @return a status */ @@ -146,7 +140,10 @@ private: bool _reshape_b_only_on_first_run; bool _is_prepared; - ITensorPack _asm_glue_tensors{}; + ITensorPack _asm_glue_run_pack; + ITensorPack _asm_glue_prep_pack; + WorkspaceData _asm_glue_workspace; + experimental::MemoryRequirements _aux_mem_req; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMM_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index f39ce4dfa3..53ceb6d978 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -29,15 +29,12 @@ #include "arm_compute/runtime/IMemoryManager.h" #include + namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} /** Basic function to compute the convolution layer. This function calls the following kernels/functions: * diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index dc9783f9eb..ff888760e1 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -24,32 +24,15 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H -#include "NEActivationLayer.h" -#include "arm_compute/core/ITensorPack.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" #include namespace arm_compute { class ITensor; -class NEConvertQuantizedSignednessKernel; -class NEGEMMInterleave4x4Kernel; -class NEGEMMLowpMatrixMultiplyKernel; -class NEGEMMLowpOffsetContributionKernel; -class NEGEMMLowpOffsetContributionOutputStageKernel; -class NEGEMMLowpMatrixAReductionKernel; -class NEGEMMLowpMatrixBReductionKernel; -class NEGEMMTranspose1xWKernel; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} - /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available: * * -# @ref NEGEMMInterleave4x4Kernel @@ -119,14 +102,7 @@ public: void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[in] output Output tensor info. 
Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should be executed only for the first run + * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure() * * @return a status */ @@ -137,41 +113,8 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr _asm_glue; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - std::unique_ptr _mtx_a_reduction_kernel; - std::unique_ptr _mtx_b_reduction_kernel; - std::unique_ptr _offset_contribution_kernel; - std::unique_ptr _offset_contribution_output_stage_kernel; - NEActivationLayer _activation_func; - std::unique_ptr _convert_to_signed_asymm; - std::unique_ptr _convert_from_signed_asymm; - - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - Tensor _signed_a; - Tensor _signed_output; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; - bool _run_activation; - bool _flip_signedness; - - ITensorPack _asm_glue_tensors{}; + struct Impl; + std::unique_ptr _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index f9ebf608cb..b02c4ed5b7 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -96,20 +96,9 @@ public: void run() override; void prepare() override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer + /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. 
Default is false + * Similar to @ref NEWinogradConvolutionLayer::configure() * * @return a status */ diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index e751e6025d..619a4ec122 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -70,7 +70,7 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement auto aux_tensor = workspace_memory.back().second.get(); ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor); - aux_tensor->allocator()->init(aux_info); + aux_tensor->allocator()->init(aux_info, req.alignment); if(req.lifetime == experimental::MemoryLifetime::Temporary) { diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 35126ec0d7..14b0633e09 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -41,14 +41,10 @@ using OperatorType = opencl::ClGemm; struct CLGEMM::Impl { - const ICLTensor *a{ nullptr }; const ICLTensor *b{ nullptr }; - const ICLTensor *c{ nullptr }; - ICLTensor *dst{ nullptr }; std::unique_ptr op{ nullptr }; MemoryGroup memory_group{}; IWeightsManager *weights_manager{ nullptr }; - CLTensor weights_transformed{}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryRequirements aux_mem_req{}; @@ -74,10 +70,7 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - _impl->a = a; _impl->b = b; - _impl->c = c; - _impl->dst = output; _impl->op = std::make_unique(); _impl->is_prepared = gemm_info.retain_internal_weights(); @@ -87,12 +80,12 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor // Manage/allocate auxilairy tensors if(_impl->is_prepared) { - _impl->run_pack.add_const_tensor(ACL_SRC_0, _impl->a); - _impl->run_pack.add_tensor(ACL_DST, _impl->dst); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _impl->run_pack = { { ACL_SRC_0, _impl->a }, { ACL_SRC_2, _impl->c }, { ACL_DST, _impl->dst } }; + _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } }; _impl->prep_pack = { { ACL_SRC_1, _impl->b } }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); @@ -110,7 +103,6 @@ void CLGEMM::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->a, _impl->b, _impl->dst); _impl->op->run(_impl->run_pack); } diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 7318c3e492..9b14052c75 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -38,10 +38,12 @@ #include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include +using namespace arm_compute::experimental; using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -61,9 +63,31 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } // namespace NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique()), _ma_kernel(), - _alpha_scale_func(nullptr), _add_bias(), 
_activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), - _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false) + : _memory_group(memory_manager), + _weights_manager(weights_manager), + _interleave_kernel(), + _transpose_kernel(), + _mm_kernel(), + _asm_glue(std::make_unique()), + _ma_kernel(), + _alpha_scale_func(nullptr), + _add_bias(), + _activation_func(), + _tmp_a(), + _tmp_b(), + _tmp_d(), + _original_b(nullptr), + _run_vector_matrix_multiplication(false), + _run_alpha_scale(false), + _run_addition(false), + _run_bias_addition(false), + _run_activation(false), + _reshape_b_only_on_first_run(false), + _is_prepared(false), + _asm_glue_run_pack(), + _asm_glue_prep_pack(), + _asm_glue_workspace(), + _aux_mem_req() { } @@ -94,13 +118,16 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); - _asm_glue_tensors = + _aux_mem_req = _asm_glue->workspace(); + _asm_glue_run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c_to_use }, { ACL_DST, d }, }; + _asm_glue_prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c_to_use } }; + _asm_glue_workspace = manage_workspace(_aux_mem_req, _memory_group, _asm_glue_run_pack, _asm_glue_prep_pack); // Scale product by alpha if(_run_alpha_scale) @@ -323,7 +350,7 @@ void NEGEMM::run() if(_asm_glue->is_configured()) { - _asm_glue->run(_asm_glue_tensors); + _asm_glue->run(_asm_glue_run_pack); if(_run_alpha_scale) { _alpha_scale_func.run(); @@ -372,16 +399,20 @@ void NEGEMM::prepare() const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); if(_asm_glue->is_configured()) { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _asm_glue->prepare(_asm_glue_prep_pack); - _asm_glue->prepare(_asm_glue_tensors); - if(!original_b_managed_by_weights_manager) + auto has_reshape = std::find_if(_aux_mem_req.begin(), + _aux_mem_req.end(), + [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if(has_reshape != std::end(_aux_mem_req)) { _original_b->mark_as_unused(); } + else + { + _asm_glue_run_pack.add_const_tensor(ACL_SRC_1, _original_b); + } } else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) { diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 564ce2f514..3ca5239ae3 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -24,50 +24,98 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" -#include - namespace arm_compute { using OperatorType = cpu::CpuGemmDirectConv2d; +using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - ITensorPack tensors{}; - std::unique_ptr op{ nullptr }; + const ITensor *weights{ nullptr }; + std::unique_ptr op{ nullptr }; + ITensorPack run_pack{}; + ITensorPack 
prep_pack{}; + WorkspaceData workspace{}; + MemoryGroup memory_group{}; + bool is_prepared{ false }; + experimental::MemoryRequirements aux_mem_req{}; }; NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { - _impl->op = std::make_unique(memory_manager); + _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input); - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights); - _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases); - _impl->tensors.add_tensor(TensorType::ACL_DST, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + _impl->weights = weights; + _impl->is_prepared = false; + _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; + _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } }; + _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } + void NEGEMMConv2d::run() { - _impl->op->run(_impl->tensors); + prepare(); + + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } + void NEGEMMConv2d::prepare() { - _impl->op->prepare(_impl->tensors); + if(!_impl->is_prepared) + { + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), + _impl->aux_mem_req.end(), + [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if(has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->weights->mark_as_unused(); + } + else + { + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); + } + + // Release temporary tensors that are only used in prepare stage + for(auto &ws : _impl->workspace) + { + const int slot = ws.first; + for(auto &m : _impl->aux_mem_req) + { + if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) + { + auto tensor = ws.second.get(); + tensor->allocator()->free(); + break; + } + } + } + _impl->is_prepared = true; + } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index cc0f20e695..224fb1eb56 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -32,9 +32,14 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" 
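The NEGEMM and NEGEMMConv2d changes above all follow the same memory-injection flow: the stateless operator only reports its auxiliary memory via workspace(), while the function-level wrapper allocates those buffers, injects them into a run pack and a prepare pack, and releases prepare-only buffers once prepare() has run. Below is a minimal, self-contained sketch of that flow; every type in it (StatelessOp, FunctionWrapper, TensorPack, MemoryInfo, ...) is a simplified stand-in, not the real Compute Library class.

    // Sketch only: simplified stand-in types, not the Compute Library API.
    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <memory>
    #include <vector>

    enum class Lifetime { Temporary, Prepare, Persistent };

    struct MemoryInfo { int slot; Lifetime lifetime; std::size_t size; };
    struct Tensor     { std::vector<std::uint8_t> buffer; void free() { buffer.clear(); } };
    using  TensorPack = std::map<int, Tensor *>;          // slot -> tensor
    using  MemoryReqs = std::vector<MemoryInfo>;

    // Stateless operator: it only *describes* the auxiliary memory it needs.
    struct StatelessOp
    {
        MemoryReqs workspace() const { return { { 100, Lifetime::Prepare, 256 } }; }
        void prepare(TensorPack &) {}
        void run(TensorPack &) {}
    };

    // Function-level wrapper: owns the buffers and injects them into the packs
    // (roughly the role manage_workspace() plays for the functions in this patch).
    struct FunctionWrapper
    {
        StatelessOp op{};
        MemoryReqs  aux_mem{};
        TensorPack  run_pack{}, prep_pack{};
        std::vector<std::unique_ptr<Tensor>> workspace{};
        bool        prepared{ false };

        void configure()
        {
            aux_mem = op.workspace();
            for(const auto &req : aux_mem)
            {
                workspace.emplace_back(std::make_unique<Tensor>());
                workspace.back()->buffer.resize(req.size);
                run_pack[req.slot]  = workspace.back().get();
                prep_pack[req.slot] = workspace.back().get();
            }
        }

        void prepare()
        {
            if(prepared) { return; }
            op.prepare(prep_pack);
            // Buffers needed only while preparing are released straight away.
            for(const auto &req : aux_mem)
            {
                if(req.lifetime == Lifetime::Prepare) { run_pack[req.slot]->free(); }
            }
            prepared = true;
        }

        void run()
        {
            prepare();
            op.run(run_pack);
        }
    };

The real functions additionally scan the requirements for a Persistent entry (the has_reshape lookup above) to decide whether the original weights tensor can be marked as unused or must be kept in the run pack.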
#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" #include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" @@ -61,17 +66,58 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } } // namespace +struct NEGEMMLowpMatrixMultiplyCore::Impl +{ + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{ nullptr }; + std::unique_ptr asm_glue{ nullptr }; + std::unique_ptr mm_kernel{ nullptr }; + std::unique_ptr mtx_a_reshape_kernel{ nullptr }; + std::unique_ptr mtx_b_reshape_kernel{ nullptr }; + std::unique_ptr mtx_a_reduction_kernel{ nullptr }; + std::unique_ptr mtx_b_reduction_kernel{ nullptr }; + std::unique_ptr offset_contribution_kernel{ nullptr }; + std::unique_ptr offset_contribution_output_stage_kernel{ nullptr }; + std::unique_ptr activation_func{ nullptr }; + std::unique_ptr convert_to_signed_asymm{ nullptr }; + std::unique_ptr convert_from_signed_asymm{ nullptr }; + + Tensor vector_sum_col{}; + Tensor vector_sum_row{}; + Tensor tmp_a{}; + Tensor tmp_b{}; + Tensor mm_result_s32{}; + Tensor signed_a{}; + Tensor signed_output{}; + const ITensor *original_b{ nullptr }; + int32_t a_offset{ 0 }; + int32_t b_offset{ 0 }; + + bool run_vector_matrix_multiplication{ false }; + bool assembly_path{ false }; + bool fused_assembly_path{ false }; + bool reshape_b_only_on_first_run{ false }; + bool is_prepared{ false }; + bool fuse_output_stage{ false }; + bool run_activation{ false }; + bool flip_signedness{ false }; + + experimental::MemoryRequirements aux_mem_req{}; + ITensorPack asm_glue_run_pack{}; + ITensorPack asm_glue_prep_pack{}; + WorkspaceData asm_glue_workspace{}; +}; + +using namespace arm_compute::experimental; using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), - _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), - _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), - _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), - _run_activation(false), _flip_signedness(false) + : _impl(std::make_unique()) { + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) @@ -85,53 +131,55 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, GEMMInfo info = gemm_info; // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - 
_flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _original_b = b; + _impl->a_offset = a->info()->quantization_info().uniform().offset; + _impl->b_offset = b->info()->quantization_info().uniform().offset; + _impl->run_vector_matrix_multiplication = a->info()->dimension(1) < 2; + _impl->reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); + _impl->is_prepared = false; + _impl->fused_assembly_path = false; + _impl->flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _impl->reshape_b_only_on_first_run; + _impl->original_b = b; + + _impl->asm_glue = std::make_unique(); const ITensor *a_to_use = a; // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) + if(_impl->flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform(); - _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); - _memory_group.manage(&_signed_a); - _convert_to_signed_asymm = std::make_unique(); - _convert_to_signed_asymm->configure(a_to_use, &_signed_a); - a_to_use = &_signed_a; - _a_offset = _signed_a.info()->quantization_info().uniform().offset; + _impl->signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); + _impl->memory_group.manage(&_impl->signed_a); + _impl->convert_to_signed_asymm = std::make_unique(); + _impl->convert_to_signed_asymm->configure(a_to_use, &_impl->signed_a); + a_to_use = &_impl->signed_a; + _impl->a_offset = _impl->signed_a.info()->quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); - _memory_group.manage(&_signed_output); - _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); + _impl->memory_group.manage(&_impl->signed_output); + _impl->signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset; + output_stage_corr.gemmlowp_offset = _impl->signed_output.info()->quantization_info().uniform().offset; output_stage_corr.gemmlowp_min_bound -= offset_correction; output_stage_corr.gemmlowp_max_bound -= offset_correction; info.set_gemmlowp_output_stage(output_stage_corr); // Update matrix a - matrix_a = &_signed_a; + matrix_a = &_impl->signed_a; } // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); + _impl->fuse_output_stage = true; + _impl->memory_group.manage(&_impl->mm_result_s32); TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); + _impl->mm_result_s32.allocator()->init(info_mm_result_s32); } // Initialize 
assembly kernel meta-data @@ -147,20 +195,28 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { auto c_info_to_use = c == nullptr ? nullptr : c->info(); - _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info); - _fused_assembly_path = _asm_glue->is_configured(); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); - _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output); + _impl->asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info); + _impl->fused_assembly_path = _impl->asm_glue->is_configured(); + _impl->asm_glue_run_pack.add_const_tensor(TensorType::ACL_SRC_2, c); + _impl->asm_glue_run_pack.add_tensor(TensorType::ACL_DST, output); } else { - auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output); - _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info); - _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); + auto output_to_use = (_impl->fuse_output_stage ? &_impl->mm_result_s32 : output); + _impl->asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info); + _impl->asm_glue_run_pack.add_tensor(TensorType::ACL_DST, output_to_use); + } + _impl->assembly_path = _impl->asm_glue->is_configured(); + + if(_impl->assembly_path) + { + _impl->asm_glue_run_pack.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + + _impl->aux_mem_req = _impl->asm_glue->workspace(); + _impl->asm_glue_prep_pack = { { TensorType::ACL_SRC_1, b }, { TensorType::ACL_SRC_2, c } }; + + _impl->asm_glue_workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->asm_glue_run_pack, _impl->asm_glue_prep_pack); } - _assembly_path = _asm_glue->is_configured(); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); - _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); break; } default: @@ -170,142 +226,144 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, } } #endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) + if(!(_impl->assembly_path || _impl->run_vector_matrix_multiplication)) { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; + matrix_a = &_impl->tmp_a; + matrix_b = &_impl->tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) + _impl->tmp_a.allocator()->init(a_info); + _impl->tmp_b.allocator()->init(b_info); + _impl->memory_group.manage(&_impl->tmp_a); + if(!_impl->reshape_b_only_on_first_run) { - _memory_group.manage(&_tmp_b); + _impl->memory_group.manage(&_impl->tmp_b); } // Configure interleave kernel - _mtx_a_reshape_kernel = std::make_unique(); - _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a); + _impl->mtx_a_reshape_kernel = std::make_unique(); + 
_impl->mtx_a_reshape_kernel->configure(a_to_use, &_impl->tmp_a); // Configure transpose kernel - _mtx_b_reshape_kernel = std::make_unique(); - _mtx_b_reshape_kernel->configure(b, &_tmp_b); + _impl->mtx_b_reshape_kernel = std::make_unique(); + _impl->mtx_b_reshape_kernel->configure(b, &_impl->tmp_b); } - if(!_fused_assembly_path) + if(!_impl->fused_assembly_path) { // Build reduction info const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false); - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + // Initialize matrix B reduction kernel only if _impl->a_offset is not equal to 0 + if(_impl->a_offset != 0) { TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) + _impl->vector_sum_col.allocator()->init(info_vector_sum_col); + if(!_impl->reshape_b_only_on_first_run) { - _memory_group.manage(&_vector_sum_col); + _impl->memory_group.manage(&_impl->vector_sum_col); } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel = std::make_unique(); - _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info); + _impl->mtx_b_reduction_kernel = std::make_unique(); + _impl->mtx_b_reduction_kernel->configure(b, &_impl->vector_sum_col, reduction_info); } - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + // Initialize Matrix A reduction kernel only if _impl->b_offset is not equal to 0 + if(_impl->b_offset != 0) { TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); + _impl->vector_sum_row.allocator()->init(info_vector_sum_row); + _impl->memory_group.manage(&_impl->vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel = std::make_unique(); - _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); + _impl->mtx_a_reduction_kernel = std::make_unique(); + _impl->mtx_a_reduction_kernel->configure(a_to_use, &_impl->vector_sum_row, reduction_info); } - if(_fuse_output_stage) + if(_impl->fuse_output_stage) { // Configure matrix multiply kernel - if(!_assembly_path) + if(!_impl->assembly_path) { - _mm_kernel = std::make_unique(); - _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); + _impl->mm_kernel = std::make_unique(); + _impl->mm_kernel->configure(matrix_a, matrix_b, &_impl->mm_result_s32); } - _offset_contribution_output_stage_kernel = std::make_unique(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, - a->info()->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _impl->offset_contribution_output_stage_kernel = std::make_unique(); + _impl->offset_contribution_output_stage_kernel->configure(&_impl->mm_result_s32, + _impl->a_offset == 0 ? nullptr : &_impl->vector_sum_col, + _impl->b_offset == 0 ? nullptr : &_impl->vector_sum_row, c, + _impl->flip_signedness ? 
&_impl->signed_output : output, + a->info()->dimension(0), + _impl->a_offset, _impl->b_offset, info.gemmlowp_output_stage()); - if(_flip_signedness) + if(_impl->flip_signedness) { - _convert_from_signed_asymm = std::make_unique(); - _convert_from_signed_asymm->configure(&_signed_output, output); + _impl->convert_from_signed_asymm = std::make_unique(); + _impl->convert_from_signed_asymm->configure(&_impl->signed_output, output); } } else { // Configure matrix multiply kernel - if(!_assembly_path) + if(!_impl->assembly_path) { - _mm_kernel = std::make_unique(); - _mm_kernel->configure(matrix_a, matrix_b, output); + _impl->mm_kernel = std::make_unique(); + _impl->mm_kernel->configure(matrix_a, matrix_b, output); } // Configure offset contribution kernel - _offset_contribution_kernel = std::make_unique(); - _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); + _impl->offset_contribution_kernel = std::make_unique(); + _impl->offset_contribution_kernel->configure(output, _impl->a_offset == 0 ? nullptr : &_impl->vector_sum_col, _impl->b_offset == 0 ? nullptr : &_impl->vector_sum_row, a_to_use->info()->dimension(0), + _impl->a_offset, _impl->b_offset); } } // Configure activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) + _impl->run_activation = activation.enabled() && (!_impl->assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if(_impl->run_activation) { - _activation_func.configure(output, nullptr, activation); + _impl->activation_func = std::make_unique(); + _impl->activation_func->configure(output, nullptr, activation); } // Allocate tensors - if(!_assembly_path && !_run_vector_matrix_multiplication) + if(!_impl->assembly_path && !_impl->run_vector_matrix_multiplication) { - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) + _impl->tmp_a.allocator()->allocate(); + if(!_impl->reshape_b_only_on_first_run) { - _tmp_b.allocator()->allocate(); + _impl->tmp_b.allocator()->allocate(); } } - if(!_fused_assembly_path) + if(!_impl->fused_assembly_path) { - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if(_impl->a_offset != 0 && !_impl->reshape_b_only_on_first_run) { - _vector_sum_col.allocator()->allocate(); + _impl->vector_sum_col.allocator()->allocate(); } - if(_b_offset != 0) + if(_impl->b_offset != 0) { - _vector_sum_row.allocator()->allocate(); + _impl->vector_sum_row.allocator()->allocate(); } } - if(_fuse_output_stage) + if(_impl->fuse_output_stage) { - _mm_result_s32.allocator()->allocate(); + _impl->mm_result_s32.allocator()->allocate(); } - if(_flip_signedness) + if(_impl->flip_signedness) { - _signed_a.allocator()->allocate(); - _signed_output.allocator()->allocate(); + _impl->signed_a.allocator()->allocate(); + _impl->signed_output.allocator()->allocate(); } } @@ -509,118 +567,112 @@ void NEGEMMLowpMatrixMultiplyCore::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); + MemoryGroupResourceScope scope_mg(_impl->memory_group); // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) + if(_impl->flip_signedness) { - NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY); + NEScheduler::get().schedule(_impl->convert_to_signed_asymm.get(), Window::DimY); } // Run GEMM - 
if(_asm_glue->is_configured()) + if(_impl->asm_glue->is_configured()) { - _asm_glue->run(_asm_glue_tensors); + _impl->asm_glue->run(_impl->asm_glue_run_pack); } else { - if(!_run_vector_matrix_multiplication) + if(!_impl->run_vector_matrix_multiplication) { // Run interleave kernel - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->mtx_a_reshape_kernel.get(), Window::DimY); - if(!_reshape_b_only_on_first_run) + if(!_impl->reshape_b_only_on_first_run) { // Run transpose kernel - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->mtx_b_reshape_kernel.get(), Window::DimY); } } - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->mm_kernel.get(), Window::DimY); } - if(!_fused_assembly_path) + if(!_impl->fused_assembly_path) { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + // Run matrix A reduction kernel only if _impl->b_offset is not equal to 0 + if(_impl->b_offset != 0) { - NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX); + NEScheduler::get().schedule(_impl->mtx_a_reduction_kernel.get(), Window::DimX); } - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + // Run matrix B reduction kernel only if _impl->a_offset is not equal to 0 + if(_impl->a_offset != 0 && !_impl->reshape_b_only_on_first_run) { - NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); + NEScheduler::get().schedule(_impl->mtx_b_reduction_kernel.get(), Window::DimX); } - if(_fuse_output_stage) + if(_impl->fuse_output_stage) { // Run offset contribution kernel - NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->offset_contribution_output_stage_kernel.get(), Window::DimY); } else { // Run offset contribution kernel - NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY); + NEScheduler::get().schedule(_impl->offset_contribution_kernel.get(), Window::DimY); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + if(!_impl->fused_assembly_path && _impl->fuse_output_stage && _impl->flip_signedness) { - NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY); + NEScheduler::get().schedule(_impl->convert_from_signed_asymm.get(), Window::DimY); } // Run fused activation unless already run in the fused assembly - if(_run_activation) + if(_impl->run_activation) { - _activation_func.run(); + _impl->activation_func->run(); } } void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if(!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); // Run assembly reshape - if(_asm_glue->is_configured()) + if(_impl->asm_glue->is_configured()) { - if(!original_b_managed_by_weights_manager) + _impl->asm_glue->prepare(_impl->asm_glue_prep_pack); + + auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), + _impl->aux_mem_req.end(), + [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if(has_reshape != std::end(_impl->aux_mem_req)) { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + _impl->original_b->mark_as_unused(); } - - _asm_glue->prepare(_asm_glue_tensors); - if(!original_b_managed_by_weights_manager) 
+ else { - _original_b->mark_as_unused(); + _impl->asm_glue_run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b); } } // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else if(_impl->reshape_b_only_on_first_run && !_impl->run_vector_matrix_multiplication && !_impl->asm_glue->is_configured()) { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + _impl->tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(_impl->mtx_b_reshape_kernel.get(), Window::DimY); } - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + // Run matrix B reduction kernel only if _impl->a_offset is not equal to 0 + if(!_impl->fused_assembly_path && _impl->a_offset != 0 && _impl->reshape_b_only_on_first_run) { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); + _impl->vector_sum_col.allocator()->allocate(); + NEScheduler::get().schedule(_impl->mtx_b_reduction_kernel.get(), Window::DimX); } - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 0bf1738bec..57950d5126 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -29,12 +29,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/core/NEON/kernels/convolution/winograd/winograd.hpp" diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp index e50099df1f..c2e9f24ff6 100644 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" + +#include "support/Cast.h" #include @@ -37,6 +37,9 @@ namespace arm_compute { namespace cpu { +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; + namespace { GEMMLowpOutputStageInfo calculate_output_stage_metadata(const 
ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) @@ -87,12 +90,14 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect } } // namespace -CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr &memory_manager) - : _gemm_asm_func(std::make_unique(memory_manager)), +CpuGemmDirectConv2d::CpuGemmDirectConv2d() + : _gemm_asm_func(std::make_unique()), _activation_func(std::make_unique()), _weights_permute_func(std::make_unique()), - _permuted_weights_info(), - _permuted_weights(std::make_unique()) + _aux_mem(AuxTensorIdx::Count), + _perm_weights(), + _run_activation(false), + _is_prepared(false) { } @@ -106,8 +111,10 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w biases != nullptr ? biases : nullptr, dst, info)); - _original_weights_info = weights; - _weights_permute_func->configure(weights, &_permuted_weights_info, PermutationVector{ 3, 0, 1, 2 }); + _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); + _is_prepared = false; + + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 }); // Configure assembly dispatch cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); @@ -115,13 +122,27 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w { asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); } - _gemm_asm_func->configure(src, &_permuted_weights_info, biases, dst, asm_info); + _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); // Configure activation - if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info)) + if(_run_activation) { _activation_func->configure(dst, nullptr, info.act_info); - _run_activation = true; + } + + // Add auxiliary memory requirements of the assembly dispatch + auto asm_mem_req = _gemm_asm_func->workspace(); + _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; + _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; + + if(_aux_mem[Pretranspose].size > 0) + { + // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch + _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + } + else + { + _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); } } Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) @@ -172,35 +193,29 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors) } } -void CpuGemmDirectConv2d::allocate_permuted_weights() -{ - // TODO: This function will be removed when memory injection is implemeted. 
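For context on the PermutedWeights lifetime chosen in CpuGemmDirectConv2d::configure() above: when the assembly dispatch reports a Pretranspose requirement, the permuted copy of the weights is only consumed during prepare(), so it can be declared with MemoryLifetime::Prepare; otherwise it has to outlive prepare(). A rough sketch of that decision follows, with simplified stand-in types rather than the real MemoryInfo / offset_int_vec helpers.

    // Sketch only: simplified stand-in types, not the Compute Library API.
    #include <cstddef>

    enum class Lifetime { Temporary, Prepare, Persistent };
    struct MemoryInfo { int slot; Lifetime lifetime; std::size_t size; };

    MemoryInfo permuted_weights_requirement(bool asm_path_pretransposes, int slot, std::size_t weights_size)
    {
        // Permuted weights are transposed again by the assembly path -> prepare-only;
        // otherwise they are the buffer actually used at run time -> persistent.
        const Lifetime lt = asm_path_pretransposes ? Lifetime::Prepare : Lifetime::Persistent;
        return MemoryInfo{ slot, lt, weights_size };
    }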
- ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr); - _permuted_weights->allocator()->free(); - _permuted_weights->allocator()->init(_permuted_weights_info); - _permuted_weights->allocator()->allocate(); -} - void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) { if(!_is_prepared) { - allocate_permuted_weights(); - ITensorPack permute_tensors - { - { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, - { TensorType::ACL_DST, _permuted_weights.get() }, - }; + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); + CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); + ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; _weights_permute_func->run(permute_tensors); - tensors.get_const_tensor(TensorType::ACL_SRC_1)->mark_as_unused(); + tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); + // Call prepare of assembly dispatch + _gemm_asm_func->prepare(tensors); - // switch the original tensor with permuted tensor - tensors.add_const_tensor(TensorType::ACL_SRC_1, _permuted_weights.get()); _is_prepared = true; } } +experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const +{ + return _aux_mem; +} } // namespace cpu } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h index 6aa17c2349..b572f36a3a 100644 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h @@ -24,14 +24,12 @@ #ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H #define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/core/TensorInfo.h" #include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" #include "src/runtime/cpu/ICpuOperator.h" - -#include +#include "src/runtime/cpu/operators/CpuActivation.h" +#include "src/runtime/cpu/operators/CpuPermute.h" +#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" namespace arm_compute { @@ -40,15 +38,11 @@ class ITensor; struct Conv2dInfo; namespace cpu { -class CpuGemmAssemblyDispatch; -class CpuActivation; -class CpuPermute; - class CpuGemmDirectConv2d : public ICpuOperator { public: /** Constructor */ - CpuGemmDirectConv2d(const std::shared_ptr &memory_manager = nullptr); + CpuGemmDirectConv2d(); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d); /** Destructor */ ~CpuGemmDirectConv2d(); @@ -89,22 +83,24 @@ public: // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; private: + enum AuxTensorIdx + { + AsmGemmWorkspace = 0, + Pretranspose, + PermutedWeights, + Count + }; + std::unique_ptr _gemm_asm_func; std::unique_ptr _activation_func; std::unique_ptr _weights_permute_func; - const ITensorInfo *_original_weights_info{}; - TensorInfo _permuted_weights_info; - std::unique_ptr _permuted_weights{ nullptr }; - bool _is_prepared{ false }; - bool _run_activation{ false }; - - /** Function to allocated a tensor for permuted weights - * - * @note This function will be removed when memory injection is properly implemented. 
- */ - void allocate_permuted_weights(); + experimental::MemoryRequirements _aux_mem; + TensorInfo _perm_weights; + bool _run_activation; + bool _is_prepared; }; } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 1101e05a0d..79ea1cb5a7 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -27,15 +27,18 @@ #include "src/core/CPP/Validate.h" #include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/core/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/AssemblyUtils.h" +#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" #include -#include namespace arm_compute { namespace cpu { +using namespace arm_compute::experimental; + namespace { struct free_delete @@ -113,103 +116,27 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp return scheduling_hint; } -template -class FallbackTransform : public ITransformWeights -{ -public: - FallbackTransform() noexcept {}; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform(const FallbackTransform &) = delete; - /** Default move constructor */ - FallbackTransform(FallbackTransform &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform &operator=(const FallbackTransform &) = delete; - /** Default move assignment operator */ - FallbackTransform &operator=(FallbackTransform &&) = default; - void run() override - { - _output.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - uint32_t id = (_B_pretranspose_size | 0x80000000); - return id; - } - - void configure(size_t B_pretranspose_size, unsigned int alignment) - { - _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment); - _B_pretranspose_size = B_pretranspose_size; - } - - void set_pretranspose(ITensor *tensor) - { - if(!_reshape_run) - { - _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer()); - } - } - - void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr> gemm_kernel_asm) - { - _ldb = ldb; - _in1_ptr = in1_ptr; - _multi_stride_b = multi_stride_b; - _gemm_kernel_asm = gemm_kernel_asm; - } - -private: - Tensor _output{}; - int _ldb{}; - const TypeInput *_in1_ptr{}; - int _multi_stride_b{}; - size_t _B_pretranspose_size{}; - std::shared_ptr> _gemm_kernel_asm{ nullptr }; -}; - /** Fallback in case ACL doesn't have a function */ template class Fallback : public CpuGemmAssemblyDispatch::IFallback { public: /** Destructor */ - ~Fallback() - { - if(_pretranspose && !(is_weight_managed())) - { - delete _pretranspose; - } - } + ~Fallback() = default; /** Initialise the functions's input and output. * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[in] c Input tensor containing the Matrix C. 
- * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] args Matrix multiplication information. - * @param[in] gemm_info GEMM meta-data - * @param[in] memory_group Memory group to be used by the function. - * @param[in] weights_manager Weights manager to be used by the function. - * @param[in] os Output stage meta-data. + * @param[in] a Input tensor containing the Matrix A. + * @param[in] b Input tensor containing the Matrix B. + * @param[in] c Input tensor containing the Matrix C. + * @param[out] d Output tensor to store the result of matrix multiplication. + * @param[in] args Matrix multiplication information. + * @param[in] gemm_info GEMM meta-data + * @param[in] os Output stage meta-data. */ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); + const OutputStage &os = {}); /** Set requantization shifts to be used * @@ -231,16 +158,17 @@ public: // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; - bool is_configured() const override; + bool is_configured() const override; + experimental::MemoryRequirements workspace() const override; private: - /** Allocate a workspace tensor. - * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. - */ - void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); + enum AuxTensorIdx + { + AsmGemmWorkspace = 0, + Pretranspose, + Count + }; + /** Configure the indirect buffer * * @param[in] a Input tensor containing the Matrix A. @@ -256,18 +184,14 @@ private: std::shared_ptr> _gemm_kernel_asm{ nullptr }; /** Optimised Arm® Neon™ kernel */ std::unique_ptr _optimised_kernel{ nullptr }; - /** GEMM workspace */ - Tensor _workspace{}; - /** Pre-transpose tensor */ - ITensor *_pretranspose{ nullptr }; + /** Assembly GEMM workspace tensor info */ + TensorInfo _workspace_info{}; + /** Pre-transpose tensor info */ + TensorInfo _pretranspose_info{}; /** Prepared flag */ bool _is_prepared{ false }; /** GEMM meta-data */ AsmGemmInfo _gemm_info{}; - /** Weights manager */ - IWeightsManager *_weights_manager{ nullptr }; - /** Weights transform object */ - FallbackTransform _weights_transform{}; /** GEMM kernel description */ arm_gemm::KernelDescription _kernel_info{}; /** Per channel quantization shifts */ @@ -279,27 +203,9 @@ private: /** Indirect buffer */ std::unique_ptr _indirect_arg{}; std::unique_ptr _indirect_buf{}; - std::vector _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - - bool is_weight_managed() - { - // TODO (COMPMID-4539): This function should do the following: - // _weights_manager && _weights_manager->are_weights_managed(_b) - // , where _b is the second Tensor that is used to be given to the configure(). - // Currently, however, weight manager is disabled to make this class stateless. - // This should be revisited in the future. - return false; - } - - void acquire_managed_weight() - { - // TODO (COMPMID-4539): This function should do the following: - // _pretranspose = _weights_manager->acquire(_b, &_weights_transform); - // , where _b is the second Tensor that is used to be given to the configure(). - // Currently, however, weight manager is disabled to make this class stateless. 
- _pretranspose = nullptr; - } + std::vector _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{ Count }; }; template @@ -439,12 +345,11 @@ void Fallback::configure_indirect(const ITen template void Fallback::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) + const OutputStage &os) { ARM_COMPUTE_UNUSED(c); arm_gemm::GemmConfig gemm_cfg; - _kernel_info = arm_gemm::get_gemm_method(args, os); - _weights_manager = weights_manager; + _kernel_info = arm_gemm::get_gemm_method(args, os); if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED) { gemm_cfg.filter = _kernel_info.name; @@ -461,13 +366,10 @@ void Fallback::configure(const ITensorInfo * auto acl_gemm_wrapper = std::make_unique>(); ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); - const size_t workspace_size = _gemm_kernel_asm->get_working_size(); - if(workspace_size > 0) - { - // Allocate workspace - const unsigned int alignment = 4096; - allocate_workspace(workspace_size, memory_group, alignment); - } + const size_t workspace_size = _gemm_kernel_asm->get_working_size(); + const unsigned int alignment = 4096; + _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); + _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 @@ -487,16 +389,8 @@ void Fallback::configure(const ITensorInfo * // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); - if(is_weight_managed()) - { - _weights_transform.configure(B_pretranspose_size, alignment); - acquire_managed_weight(); - } - else - { - _pretranspose = new Tensor(); - static_cast(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment); - } + _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); + _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); } // Handle indirect GEMM convolution @@ -509,10 +403,11 @@ void Fallback::configure(const ITensorInfo * template void Fallback::prepare(ITensorPack &tensors) { - auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); if(!_is_prepared) { + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
if(c && c->info()->data_type() == DataType::S32) { @@ -526,24 +421,9 @@ void Fallback::prepare(ITensorPack &tensors) const auto in1_ptr = reinterpret_cast(b->buffer() + b->info()->offset_first_element_in_bytes()); const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput); - if(is_weight_managed()) - { - _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm); - _weights_manager->run(b, &_weights_transform); - - // If we didn't run the reshape function, set the pretransposed buffer - if(!_weights_transform.is_reshape_run()) - { - _weights_transform.set_pretranspose(_pretranspose); - } - } - else - { - static_cast(_pretranspose)->allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b); - b->mark_as_unused(); - } + CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); + ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); + _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b); } if(_gemm_info.method == AsmConvMethod::Indirect) @@ -556,18 +436,15 @@ void Fallback::prepare(ITensorPack &tensors) } template -void Fallback::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment) +bool Fallback::is_configured() const { - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0"); - _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment); - memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); + return _optimised_kernel != nullptr; } template -bool Fallback::is_configured() const +experimental::MemoryRequirements Fallback::workspace() const { - return _optimised_kernel != nullptr; + return _aux_mem; } template @@ -609,9 +486,10 @@ void Fallback::run(ITensorPack &tensors) const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type()); // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads - if(_workspace.buffer() != nullptr) + CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); + if(workspace.get()->buffer() != nullptr) { - _gemm_kernel_asm->set_working_space(reinterpret_cast(_workspace.buffer())); + _gemm_kernel_asm->set_working_space(reinterpret_cast(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); @@ -656,9 +534,9 @@ void Fallback::run(ITensorPack &tensors) } template -void create_arm_gemm(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, - IWeightsManager *weights_manager) +void create_arm_gemm(std::unique_ptr &arm_gemm, + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, + arm_gemm::Activation activation, const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); const CPUInfo &ci = NEScheduler::get().cpu_info(); @@ -668,14 +546,14 @@ void create_arm_gemm(std::unique_ptr &arm_ge // Create arm_gemm fallback auto fallback = std::make_unique>(); - fallback->configure(a, b, c, d, args, info, memory_group, 
weights_manager); + fallback->configure(a, b, c, d, args, info); arm_gemm = std::move(fallback); } template -void create_arm_gemm_quant(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, - IWeightsManager *weights_manager) +void create_arm_gemm_quant(std::unique_ptr &arm_gemm, + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, + arm_gemm::Activation activation, const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); @@ -713,14 +591,13 @@ void create_arm_gemm_quant(std::unique_ptr & } // Configure fallback - fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info); + fallback->configure(a, b, c, d, args, info, gemm_requant_info); arm_gemm = std::move(fallback); } - } //namespace -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager) +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() + : _arm_gemm(nullptr) { } @@ -775,40 +652,40 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo switch(a->data_type()) { case DataType::F32: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: if(d->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, a, b, c, d, act, info); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: if(d->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, a, b, c, d, act, info); } break; #endif /* __aarch64__ */ #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) case DataType::BFLOAT16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); break; #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm(_arm_gemm, a, b, c, d, act, info); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -829,10 +706,14 @@ bool CpuGemmAssemblyDispatch::is_configured() const void CpuGemmAssemblyDispatch::run(ITensorPack &tensors) { - MemoryGroupResourceScope scope_mg(_memory_group); - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); _arm_gemm->run(tensors); } + +experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + return _arm_gemm->workspace(); +} } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h 
b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h index ffc097c75c..355273adeb 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -24,10 +24,6 @@ #ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" #include "src/core/common/Macros.h" #include "src/runtime/cpu/ICpuOperator.h" @@ -62,7 +58,7 @@ class CpuGemmAssemblyDispatch : public ICpuOperator { public: /** Constructor */ - CpuGemmAssemblyDispatch(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + CpuGemmAssemblyDispatch(); /** Defautl destructor */ ~CpuGemmAssemblyDispatch() = default; @@ -71,10 +67,11 @@ public: class IFallback { public: - virtual void run(ITensorPack &tensors) = 0; - virtual void prepare(ITensorPack &tensors) = 0; - virtual bool is_configured() const = 0; - virtual ~IFallback() = default; + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual experimental::MemoryRequirements workspace() const = 0; + virtual bool is_configured() const = 0; + virtual ~IFallback() = default; }; public: @@ -115,11 +112,10 @@ public: // Inherited methods overridden: void prepare(ITensorPack &tensors) override; void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; private: - std::unique_ptr _arm_gemm; /**< Interface for the arm_gemm fallback */ - MemoryGroup _memory_group; /**< Function memory group */ - IWeightsManager *_weights_manager; /**< Pointer to the weights manager */ + std::unique_ptr _arm_gemm; /**< Interface for the arm_gemm fallback */ }; } // namespace cpu } // namespace arm_compute diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 9e00da16ae..be01655a86 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -28,6 +28,8 @@ #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h" #include "tests/NEON/Accessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/LargeConvolutionLayerDataset.h" @@ -571,6 +573,99 @@ TEST_SUITE(DirectGEMMConv2d) template using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture; +/** Test case for memory injection in @ref cpu::CpuGemmDirectConv2d. + * + * Configure the operator once and inject memory at run-time in multiple executions. 
+ * + * Checks performed in order: + * - Both runs compute the same output + */ +TEST_CASE(MemoryInjection, framework::DatasetMode::ALL) +{ + auto conv = std::make_unique(); + const auto src_info = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC); + const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto bias_info = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC); + auto dst_info = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto conv_info = Conv2dInfo{}; + conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info); + + // tensors are newly created every call of this lambda function + auto src = create_tensor(src_info); + auto weight = create_tensor(weight_info); + auto bias = create_tensor(bias_info); + src.allocator()->allocate(); + weight.allocator()->allocate(); + bias.allocator()->allocate(); + + ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } }; + ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } }; + + auto mg = MemoryGroup{}; + auto ws = manage_workspace(conv->workspace(), mg, run_pack, prep_pack); + + auto run_conv = [&]() -> Tensor + { + auto dst = create_tensor(dst_info); + dst.allocator()->allocate(); + run_pack.add_tensor(TensorType::ACL_DST, &dst); + + library->fill_tensor_value(Accessor(src), 1.f); + library->fill_tensor_value(Accessor(weight), 2.f); + library->fill_tensor_value(Accessor(bias), 3.f); + // This operator is configured once and captured by this lambda. + conv->prepare(prep_pack); + conv->run(run_pack); + return dst; + }; + auto result_0 = run_conv(); + auto result_1 = run_conv(); + for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + { + ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS); + } +} + +/** Test case for memory injection in @ref NEGEMMConv2d. + * + * Make sure @ref NEGEMMConv2d still works through injecting the memory at configure time using the old API. 
+ * + * Checks performed in order: + * - Both runs compute the same output + */ +TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL) +{ + auto conv = std::make_unique(); + const auto src_info = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC); + const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto bias_info = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC); + auto dst_info = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC); + const auto conv_info = Conv2dInfo{}; + auto run_conv = [&]() + { + auto src = create_tensor(src_info); + auto weight = create_tensor(weight_info); + auto bias = create_tensor(bias_info); + auto dst = create_tensor(dst_info); + conv->configure(&src, &weight, &bias, &dst, conv_info); + src.allocator()->allocate(); + weight.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + library->fill_tensor_value(Accessor(src), 1.f); + library->fill_tensor_value(Accessor(weight), 2.f); + library->fill_tensor_value(Accessor(bias), 3.f); + conv->run(); + return dst; + }; + auto result_0 = run_conv(); + auto result_1 = run_conv(); + for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + { + ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS); + } +} + TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), -- cgit v1.2.1
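
For reference, the caller-side flow that the MemoryInjection test above exercises can be condensed as follows. This is a minimal sketch rather than part of the patch: the wrapper function run_injected_conv is illustrative only, and the include set is assumed from the files touched in this change.

#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"

using namespace arm_compute;

void run_injected_conv(const TensorInfo &src_info, const TensorInfo &weight_info,
                       const TensorInfo &bias_info, TensorInfo &dst_info)
{
    // 1) Configure from tensor infos only: the operator keeps no tensor memory of its own.
    cpu::CpuGemmDirectConv2d conv{};
    conv.configure(&src_info, &weight_info, &bias_info, &dst_info, Conv2dInfo{});

    // 2) The caller owns the I/O tensors and passes them in through tensor packs.
    Tensor src{}, weight{}, bias{}, dst{};
    src.allocator()->init(src_info);
    weight.allocator()->init(weight_info);
    bias.allocator()->init(bias_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    weight.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight },
                          { TensorType::ACL_SRC_2, &bias }, { TensorType::ACL_DST, &dst } };
    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };

    // 3) Allocate the auxiliary buffers advertised by workspace() (Persistent
    //    pre-transposed weights, Temporary assembly workspace) and inject them
    //    into both packs.
    MemoryGroup mg{};
    auto ws = manage_workspace<Tensor>(conv.workspace(), mg, run_pack, prep_pack);

    // 4) prepare() pre-transposes the weights into the injected buffer; run()
    //    can then be called repeatedly, also with different run packs.
    conv.prepare(prep_pack);
    conv.run(run_pack);
}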
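
On the operator side, the same interface is served by the declare-then-resolve pattern that Fallback adopts in this patch: configure() only records each auxiliary buffer as a MemoryInfo entry keyed by offset_int_vec(), and prepare()/run() later pull the caller-provided buffer back out of the injected ITensorPack through CpuAuxTensorHandler. The hypothetical single-buffer operator below shows just that pattern; ExampleAsmOp and its scratch size are invented for illustration, while the types and helpers are the ones used in the hunks above.

#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute;
using namespace arm_compute::experimental;

class ExampleAsmOp
{
public:
    void configure(size_t scratch_bytes)
    {
        // Describe the scratch buffer; nothing is allocated by the operator itself.
        _scratch_info     = TensorInfo(TensorShape(scratch_bytes), 1, DataType::U8);
        _aux_mem[Scratch] = MemoryInfo(offset_int_vec(Scratch), MemoryLifetime::Temporary,
                                       scratch_bytes, 4096 /* alignment */);
    }

    experimental::MemoryRequirements workspace() const
    {
        return _aux_mem;
    }

    void run(ITensorPack &tensors)
    {
        // Resolve the caller-provided buffer from the injected pack.
        cpu::CpuAuxTensorHandler scratch(offset_int_vec(Scratch), _scratch_info, tensors, false);
        ARM_COMPUTE_ERROR_ON(scratch.get()->buffer() == nullptr);
        // ... hand scratch.get()->buffer() to the kernel as working space ...
    }

private:
    enum AuxTensorIdx
    {
        Scratch = 0,
        Count
    };

    TensorInfo                       _scratch_info{};
    experimental::MemoryRequirements _aux_mem{ Count };
};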