From d7316eb877cc4ff8573219374335e917b19a0203 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Wed, 16 Jun 2021 11:14:41 +0100
Subject: Port NEGEMMConv2d to memory injecting interface

Resolves: COMPMID-4506, COMPMID-4570

Change-Id: I6d37a06da141f1fcfcaa8525322a319cb0234791
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5824
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 265 ++++++---------------
 1 file changed, 73 insertions(+), 192 deletions(-)

(limited to 'src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp')

diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 1101e05a0d..79ea1cb5a7 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -27,15 +27,18 @@
 #include "src/core/CPP/Validate.h"
 #include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
 #include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/AssemblyUtils.h"
+#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
 
 #include
-#include
 
 namespace arm_compute
 {
 namespace cpu
 {
+using namespace arm_compute::experimental;
+
 namespace
 {
 struct free_delete
@@ -113,103 +116,27 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp
     return scheduling_hint;
 }
 
-template <typename TypeInput, typename TypeOutput>
-class FallbackTransform : public ITransformWeights
-{
-public:
-    FallbackTransform() noexcept {};
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    FallbackTransform(const FallbackTransform &) = delete;
-    /** Default move constructor */
-    FallbackTransform(FallbackTransform &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    FallbackTransform &operator=(const FallbackTransform &) = delete;
-    /** Default move assignment operator */
-    FallbackTransform &operator=(FallbackTransform &&) = default;
-    void run() override
-    {
-        _output.allocator()->allocate();
-        ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr);
-        _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b);
-        _reshape_run = true;
-    }
-
-    void release() override
-    {
-        _output.allocator()->free();
-    }
-
-    ITensor *get_weights() override
-    {
-        return &_output;
-    }
-
-    uint32_t uid() override
-    {
-        uint32_t id = (_B_pretranspose_size | 0x80000000);
-        return id;
-    }
-
-    void configure(size_t B_pretranspose_size, unsigned int alignment)
-    {
-        _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
-        _B_pretranspose_size = B_pretranspose_size;
-    }
-
-    void set_pretranspose(ITensor *tensor)
-    {
-        if(!_reshape_run)
-        {
-            _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer());
-        }
-    }
-
-    void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm)
-    {
-        _ldb             = ldb;
-        _in1_ptr         = in1_ptr;
-        _multi_stride_b  = multi_stride_b;
-        _gemm_kernel_asm = gemm_kernel_asm;
-    }
-
-private:
-    Tensor           _output{};
-    int              _ldb{};
-    const TypeInput *_in1_ptr{};
-    int              _multi_stride_b{};
-    size_t           _B_pretranspose_size{};
-    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
-};
-
 /** Fallback in case ACL doesn't have a function */
 template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
 class Fallback : public CpuGemmAssemblyDispatch::IFallback
 {
 public:
     /** Destructor */
-    ~Fallback()
-    {
-        if(_pretranspose && !(is_weight_managed()))
-        {
-            delete _pretranspose;
-        }
-    }
+    ~Fallback() = default;
 
     /** Initialise the functions's input and output.
      *
-     * @param[in]  a               Input tensor containing the Matrix A.
-     * @param[in]  b               Input tensor containing the Matrix B.
-     * @param[in]  c               Input tensor containing the Matrix C.
-     * @param[out] d               Output tensor to store the result of matrix multiplication.
-     * @param[in]  args            Matrix multiplication information.
-     * @param[in]  gemm_info       GEMM meta-data
-     * @param[in]  memory_group    Memory group to be used by the function.
-     * @param[in]  weights_manager Weights manager to be used by the function.
-     * @param[in]  os              Output stage meta-data.
+     * @param[in]  a         Input tensor containing the Matrix A.
+     * @param[in]  b         Input tensor containing the Matrix B.
+     * @param[in]  c         Input tensor containing the Matrix C.
+     * @param[out] d         Output tensor to store the result of matrix multiplication.
+     * @param[in]  args      Matrix multiplication information.
+     * @param[in]  gemm_info GEMM meta-data
+     * @param[in]  os        Output stage meta-data.
      */
     void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
-                   MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
+                   const OutputStage &os = {});
     /** Set requantization shifts to be used
      *
@@ -231,16 +158,17 @@ public:
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
     void prepare(ITensorPack &tensors) override;
-    bool is_configured() const override;
+    bool                             is_configured() const override;
+    experimental::MemoryRequirements workspace() const override;
 
 private:
-    /** Allocate a workspace tensor.
-     *
-     * @param[in] workspace_size Size to allocate.
-     * @param[in] memory_group   Tensor memory group.
-     * @param[in] alignment      Workspace memory alignment.
-     */
-    void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
+    enum AuxTensorIdx
+    {
+        AsmGemmWorkspace = 0,
+        Pretranspose,
+        Count
+    };
+
     /** Configure the indirect buffer
      *
      * @param[in]  a    Input tensor containing the Matrix A.
@@ -256,18 +184,14 @@ private:
     std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
     /** Optimised Arm® Neon™ kernel */
     std::unique_ptr _optimised_kernel{ nullptr };
-    /** GEMM workspace */
-    Tensor _workspace{};
-    /** Pre-transpose tensor */
-    ITensor *_pretranspose{ nullptr };
+    /** Assembly GEMM workspace tensor info */
+    TensorInfo _workspace_info{};
+    /** Pre-transpose tensor info */
+    TensorInfo _pretranspose_info{};
     /** Prepared flag */
     bool _is_prepared{ false };
     /** GEMM meta-data */
     AsmGemmInfo _gemm_info{};
-    /** Weights manager */
-    IWeightsManager *_weights_manager{ nullptr };
-    /** Weights transform object */
-    FallbackTransform<TypeInput, TypeOutput> _weights_transform{};
     /** GEMM kernel description */
     arm_gemm::KernelDescription _kernel_info{};
     /** Per channel quantization shifts */
@@ -279,27 +203,9 @@ private:
     /** Indirect buffer */
     std::unique_ptr _indirect_arg{};
     std::unique_ptr _indirect_buf{};
-    std::vector _indirect_pad{};
-    arm_gemm::ConvolutionParameters _cp{};
-
-    bool is_weight_managed()
-    {
-        // TODO (COMPMID-4539): This function should do the following:
-        // _weights_manager && _weights_manager->are_weights_managed(_b)
-        // , where _b is the second Tensor that is used to be given to the configure().
-        // Currently, however, weight manager is disabled to make this class stateless.
-        // This should be revisited in the future.
-        return false;
-    }
-
-    void acquire_managed_weight()
-    {
-        // TODO (COMPMID-4539): This function should do the following:
-        // _pretranspose = _weights_manager->acquire(_b, &_weights_transform);
-        // , where _b is the second Tensor that is used to be given to the configure().
-        // Currently, however, weight manager is disabled to make this class stateless.
-        _pretranspose = nullptr;
-    }
+    std::vector _indirect_pad{};
+    arm_gemm::ConvolutionParameters _cp{};
+    experimental::MemoryRequirements _aux_mem{ Count };
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -439,12 +345,11 @@ void Fallback::configure_indirect(const ITen
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
-                                                             MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
+                                                             const OutputStage &os)
 {
     ARM_COMPUTE_UNUSED(c);
     arm_gemm::GemmConfig gemm_cfg;
-    _kernel_info     = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
-    _weights_manager = weights_manager;
+    _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
     if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
     {
         gemm_cfg.filter = _kernel_info.name;
@@ -461,13 +366,10 @@ void Fallback::configure(const ITensorInfo *
     auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
     ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
     acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
-    const size_t workspace_size = _gemm_kernel_asm->get_working_size();
-    if(workspace_size > 0)
-    {
-        // Allocate workspace
-        const unsigned int alignment = 4096;
-        allocate_workspace(workspace_size, memory_group, alignment);
-    }
+    const size_t       workspace_size = _gemm_kernel_asm->get_working_size();
+    const unsigned int alignment      = 4096;
+    _workspace_info                   = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
+    _aux_mem[AsmGemmWorkspace]        = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
 
     //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
     //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
@@ -487,16 +389,8 @@ void Fallback::configure(const ITensorInfo *
         // Forcing 128-byte alignment (required by 32-bit kernels)
         const unsigned int alignment           = 128;
         const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
-        if(is_weight_managed())
-        {
-            _weights_transform.configure(B_pretranspose_size, alignment);
-            acquire_managed_weight();
-        }
-        else
-        {
-            _pretranspose = new Tensor();
-            static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
-        }
+        _pretranspose_info     = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
+        _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
     }
 
     // Handle indirect GEMM convolution
@@ -509,10 +403,11 @@ void Fallback::configure(const ITensorInfo *
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
 {
-    auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-    auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
     if(!_is_prepared)
     {
+        auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+
         // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C.
         if(c && c->info()->data_type() == DataType::S32)
         {
@@ -526,24 +421,9 @@ void Fallback::prepare(ITensorPack &tensors)
         const auto in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
         const int  multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
 
-        if(is_weight_managed())
-        {
-            _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
-            _weights_manager->run(b, &_weights_transform);
-
-            // If we didn't run the reshape function, set the pretransposed buffer
-            if(!_weights_transform.is_reshape_run())
-            {
-                _weights_transform.set_pretranspose(_pretranspose);
-            }
-        }
-        else
-        {
-            static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
-            ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
-            _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
-            b->mark_as_unused();
-        }
+        CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
+        ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
+        _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
     }
 
     if(_gemm_info.method == AsmConvMethod::Indirect)
@@ -556,18 +436,15 @@ void Fallback::prepare(ITensorPack &tensors)
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
+bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
 {
-    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
-    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
-    memory_group.manage(&_workspace);
-    _workspace.allocator()->allocate();
+    return _optimised_kernel != nullptr;
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
+experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
 {
-    return _optimised_kernel != nullptr;
+    return _aux_mem;
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -609,9 +486,10 @@ void Fallback::run(ITensorPack &tensors)
     const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
 
     // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
-    if(_workspace.buffer() != nullptr)
+    CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
+    if(workspace.get()->buffer() != nullptr)
     {
-        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
+        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
         const unsigned int split_dim   = scheduling_hint.split_dimension();
         const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
         unsigned int       num_threads = NEScheduler::get().num_threads();
@@ -656,9 +534,9 @@ void Fallback::run(ITensorPack &tensors)
 }
 
 template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
-                     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
-                     IWeightsManager *weights_manager)
+void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+                     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+                     arm_gemm::Activation activation, const AsmGemmInfo &info)
 {
     Params         p  = extract_parameters(a, b, d, info);
     const CPUInfo &ci = NEScheduler::get().cpu_info();
@@ -668,14 +546,14 @@ void create_arm_gemm(std::unique_ptr &arm_ge
 
     // Create arm_gemm fallback
     auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
-    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager);
+    fallback->configure(a, b, c, d, args, info);
     arm_gemm = std::move(fallback);
 }
 
 template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
-                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
-                           IWeightsManager *weights_manager)
+void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+                           arm_gemm::Activation activation, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_UNUSED(activation);
     Params p = extract_parameters(a, b, d, info);
@@ -713,14 +591,13 @@ void create_arm_gemm_quant(std::unique_ptr &
     }
 
     // Configure fallback
-    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info);
+    fallback->configure(a, b, c, d, args, info, gemm_requant_info);
     arm_gemm = std::move(fallback);
 }
-
 } //namespace
 
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
+    : _arm_gemm(nullptr)
 {
 }
 
@@ -775,40 +652,40 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
     switch(a->data_type())
     {
         case DataType::F32:
-            create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+            create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
             break;
 #ifdef __aarch64__
         case DataType::U8:
        case DataType::QASYMM8:
             if(d->data_type() == DataType::S32)
             {
-                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
             }
             else
             {
-                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
             }
             break;
         case DataType::S8:
        case DataType::QASYMM8_SIGNED:
             if(d->data_type() == DataType::S32)
             {
-                create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
             }
             else
            {
-                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
             }
             break;
#endif /* __aarch64__ */
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
         case DataType::BFLOAT16:
-            create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+            create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
             break;
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
+            create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
             break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
@@ -829,10 +706,14 @@ bool CpuGemmAssemblyDispatch::is_configured() const
 
 void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
 {
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
     ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
     _arm_gemm->run(tensors);
 }
+
+experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
+{
+    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+    return _arm_gemm->workspace();
+}
 } // namespace cpu
 } // namespace arm_compute
-- 
cgit v1.2.1
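
For readers unfamiliar with the memory-injecting interface this patch targets: after the change, the operator no longer owns a MemoryGroup or weights manager; it publishes its auxiliary buffers through workspace() and expects the caller to place matching tensors into the ITensorPack handed to prepare()/run(). The sketch below is illustrative only and is not part of the commit; the helper name allocate_aux_tensors is made up, and it assumes MemoryInfo exposes the slot/size/alignment fields used by the MemoryInfo(...) constructor calls in the diff above.

// Illustrative caller-side sketch (not part of the patch): allocate one tensor
// per MemoryInfo reported by CpuGemmAssemblyDispatch::workspace() and register
// it in the ITensorPack under the same slot id before calling prepare()/run().
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"

#include <memory>
#include <vector>

using namespace arm_compute;

std::vector<std::unique_ptr<Tensor>> allocate_aux_tensors(const experimental::MemoryRequirements &reqs, ITensorPack &pack)
{
    std::vector<std::unique_ptr<Tensor>> aux;
    for(const auto &req : reqs)
    {
        if(req.size == 0)
        {
            continue; // nothing requested for this slot
        }
        auto t = std::make_unique<Tensor>();
        // Plain byte buffer; over-allocate by 'alignment' so the payload start
        // can be aligned, mirroring how the diff sizes its workspace infos.
        t->allocator()->init(TensorInfo(TensorShape(req.size + req.alignment), 1, DataType::U8), req.alignment);
        t->allocator()->allocate();
        pack.add_tensor(req.slot, t.get()); // same slot id as offset_int_vec(...) above
        aux.emplace_back(std::move(t));
    }
    return aux; // caller keeps these alive for as long as the pack is in use
}

A caller would typically treat the MemoryLifetime::Temporary workspace as reusable across operators and keep the MemoryLifetime::Persistent pre-transposed weights alive between runs, which is the distinction the patch encodes in the two MemoryInfo entries.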