From 173ba9bbb19ea83f951318d9989e440768b4de8f Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Tue, 23 Jun 2020 17:25:43 +0100
Subject: COMPMID-3373: Async support to NEArithmetic* kernels/functions (Pt. 1)

Added support on NEArithmeticAddition and NEArithmeticSubtraction

Signed-off-by: Michalis Spyrou
Change-Id: Ifa805f8455ef6eff1ee627752dc1c7fe9740ec47
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3451
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 .../NEON/kernels/NEArithmeticAdditionKernel.cpp    | 26 +++++-------
 .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 44 +++++++++-----------
 .../NEON/functions/NEArithmeticAddition.cpp        | 46 ++++++++++++++++++++-
 .../NEON/functions/NEArithmeticSubtraction.cpp     | 47 +++++++++++++++++++++-
 src/runtime/NEON/functions/NEGEMM.cpp              |  8 ++--
 src/runtime/NEON/functions/NELSTMLayer.cpp         | 16 ++++----
 src/runtime/NEON/functions/NEQLSTMLayer.cpp        | 45 +++++++++++----------
 src/runtime/NEON/functions/NERNNLayer.cpp          |  8 ++--
 8 files changed, 161 insertions(+), 79 deletions(-)

diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 3878c764a6..1459f7f250 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -853,7 +853,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &input1, const ITensorInfo &input2, ITensorInfo &output)
 {
     const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
     const TensorShape &out_shape = broadcast_pair.first;
@@ -904,17 +904,17 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe
 } // namespace
 
 NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
+    : _func(nullptr), _policy()
 {
 }
 
-void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+void NEArithmeticAdditionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    auto win_config = validate_and_configure_window(*input1, *input2, *output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     static std::map<std::string, NEArithmeticAdditionKernel::AddFunction *> map_function =
     {
@@ -945,16 +945,13 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     };
 
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
     _policy = policy;
 
     std::string function_to_call("add_");
"wrap_" : "saturate_"; - function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; - function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; - function_to_call += string_from_data_type(output->info()->data_type()); + function_to_call += string_from_data_type(input1->data_type()) + "_"; + function_to_call += string_from_data_type(input2->data_type()) + "_"; + function_to_call += string_from_data_type(output->data_type()); auto it = map_function.find(function_to_call); @@ -976,13 +973,12 @@ Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITe return Status{}; } -void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info) +void NEArithmeticAdditionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input1, _input2, _output, _policy, window); + // Dispatch kernel + (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), _policy, window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index 2b3fce3fea..2097d761a7 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -719,35 +719,32 @@ inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &i } // namespace NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel() - : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy(ConvertPolicy::WRAP) + : _func(nullptr), _policy(ConvertPolicy::WRAP) { } -void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy)); - _input1 = input1; - _input2 = input2; - _output = output; _policy = policy; - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); const TensorShape &out_shape = broadcast_pair.first; const ValidRegion &valid_region = broadcast_pair.second; // Auto initialize output if not initialized - set_shape_if_empty(*output->info(), out_shape); + set_shape_if_empty(*output, out_shape); - switch(input1->info()->data_type()) + switch(input1->data_type()) { case DataType::U8: - if(input2->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::U8) + if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8) { _func = &sub_same; } - else if(input2->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::S16) + else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) { _func = &sub_U8_U8_S16; } @@ -758,14 +755,14 @@ void 
@@ -758,14 +755,14 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
            break;
        case DataType::QASYMM8:
            _func = &sub_quantized<uint8_t>;
-            set_data_type_if_unknown(*output->info(), DataType::QASYMM8);
+            set_data_type_if_unknown(*output, DataType::QASYMM8);
            break;
        case DataType::QASYMM8_SIGNED:
            _func = &sub_quantized<int8_t>;
-            set_data_type_if_unknown(*output->info(), DataType::QASYMM8_SIGNED);
+            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
            break;
        case DataType::S16:
-            if(input2->info()->data_type() == DataType::U8)
+            if(input2->data_type() == DataType::U8)
            {
                _func = &sub_S16_U8_S16;
            }
@@ -773,21 +770,21 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
            {
                _func = &sub_same<int16_t>;
            }
-            set_format_if_unknown(*output->info(), Format::S16);
+            set_format_if_unknown(*output, Format::S16);
            break;
        case DataType::QSYMM16:
            _func = &sub_QSYMM16_QSYMM16_QSYMM16;
-            set_data_type_if_unknown(*output->info(), DataType::QSYMM16);
+            set_data_type_if_unknown(*output, DataType::QSYMM16);
            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &sub_same<float16_t>;
-            set_format_if_unknown(*output->info(), Format::F16);
+            set_format_if_unknown(*output, Format::F16);
            break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func = &sub_same<float>;
-            set_format_if_unknown(*output->info(), Format::F32);
+            set_format_if_unknown(*output, Format::F32);
            break;
        default:
            _func = nullptr;
@@ -795,8 +792,8 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
 
     // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(valid_region);
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(valid_region);
 
     Window win = calculate_max_window(valid_region, Steps());
 
     INEKernel::configure(win);
@@ -810,13 +807,12 @@ Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const
     return Status{};
 }
 
-void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
+void NEArithmeticSubtractionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input1, _input2, _output, window, (_policy == ConvertPolicy::SATURATE));
+    // Dispatch kernel
+    (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), window, (_policy == ConvertPolicy::SATURATE));
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 06c71db1bd..3a2848c3a7 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -31,7 +31,9 @@ namespace arm_compute
 {
-void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+namespace experimental
+{
+void NEArithmeticAddition::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
@@ -43,4 +45,46 @@ Status NEArithmeticAddition::validate(const ITensorIn
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
 }
+MemoryRequirements NEArithmeticAddition::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+struct NEArithmeticAddition::Impl
+{
+    const ITensor *src_0{ nullptr };
+    const ITensor *src_1{ nullptr };
+    ITensor *dst{ nullptr };
+    std::unique_ptr<experimental::NEArithmeticAddition> op{ nullptr };
+};
+
+NEArithmeticAddition::NEArithmeticAddition()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::~NEArithmeticAddition() = default;
+
+Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::NEArithmeticAddition::validate(input1, input2, output, policy, act_info);
+}
+
+void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEArithmeticAddition>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+void NEArithmeticAddition::run()
+{
+    const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 20f930a286..043250ca68 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -31,7 +31,9 @@ namespace arm_compute
 {
-void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+namespace experimental
+{
+void NEArithmeticSubtraction::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
@@ -44,4 +46,47 @@ Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITenso
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
 }
+
+MemoryRequirements NEArithmeticSubtraction::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+struct NEArithmeticSubtraction::Impl
+{
+    const ITensor *src_0{ nullptr };
+    const ITensor *src_1{ nullptr };
+    ITensor *dst{ nullptr };
+    std::unique_ptr<experimental::NEArithmeticSubtraction> op{ nullptr };
+};
+
+NEArithmeticSubtraction::NEArithmeticSubtraction()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction::~NEArithmeticSubtraction() = default;
+
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::NEArithmeticSubtraction::validate(input1, input2, output, policy, act_info);
+}
+
+void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEArithmeticSubtraction>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+void NEArithmeticSubtraction::run()
+{
+    const InputTensorMap src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 89dd4a15d0..5fc12e585a 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -43,7 +43,7 @@ namespace arm_compute
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
-      _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
+      _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
       _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
@@ -141,7 +141,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
         if(_run_bias_addition)
         {
-            _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+            _add_bias.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
             _tmp_d.allocator()->allocate();
         }
     }
 
@@ -258,7 +258,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
         if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
         }
     }
 
@@ -311,7 +311,7 @@ void NEGEMM::run()
         // Run bias addition kernel
         if(_run_bias_addition)
         {
-            NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
+            _add_bias.run();
         }
     }
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 0a111363e3..d8c684bf15 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -512,7 +512,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
@@ -610,7 +610,7 @@ void NELSTMLayer::run()
     {
         _mean_std_norm_forget_gate.run();
         NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY);
-        NEScheduler::get().schedule(&_accum_forget_gate_bias, Window::DimY);
+        _accum_forget_gate_bias.run();
     }
 
     _activation_forget_gate.run();
@@ -624,7 +624,7 @@ void NELSTMLayer::run()
         {
             std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
         }
-        NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY);
+        _subtract_input_gate.run();
     }
     else
     {
@@ -640,7 +640,7 @@ void NELSTMLayer::run()
         {
             _mean_std_norm_input_gate.run();
             NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY);
-            NEScheduler::get().schedule(&_accum_input_gate_bias, Window::DimY);
+            _accum_input_gate_bias.run();
         }
         _activation_input_gate.run();
     }
@@ -648,17 +648,17 @@ void NELSTMLayer::run()
     _fully_connected_cell_state.run();
     NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY);
     _gemm_cell_state1.run();
-    NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY);
+    _accum_cell_state1.run();
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
         NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY);
-        NEScheduler::get().schedule(&_accum_cell_gate_bias, Window::DimY);
+        _accum_cell_gate_bias.run();
     }
     _activation_cell_state.run();
     NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY);
     NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY);
-    NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY);
+    _accum_cell_state2.run();
 
     if(_perform_cell_clipping)
     {
@@ -675,7 +675,7 @@ void NELSTMLayer::run()
     {
         _mean_std_norm_output_gate.run();
         NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY);
-        NEScheduler::get().schedule(&_accum_output_gate_bias, Window::DimY);
+        _accum_output_gate_bias.run();
     }
 
     _activation_output.run();
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index a22c669ca7..6eb1844a1f 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -619,7 +619,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
         if(lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
         }
     }
 
@@ -662,7 +662,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
 
     if(lstm_params.has_peephole_opt())
     {
@@ -672,7 +672,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
     if(has_layer_norm)
@@ -697,7 +697,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
 
     if(has_layer_norm)
     {
@@ -714,7 +714,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     if(lstm_params.has_cifg_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
@@ -733,7 +733,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
 
        if(lstm_params.has_peephole_opt())
        {
@@ -742,7 +742,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
        }
 
        if(has_layer_norm)
@@ -757,7 +757,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     // Cell.
     ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
     if(quantized_cell_clip > 0)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
@@ -772,7 +772,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
 
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
@@ -782,7 +782,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
     if(has_layer_norm)
@@ -837,7 +837,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
         ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info));
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
 
     if(projection_tensor_copy_required)
     {
@@ -893,13 +893,13 @@ void NEQLSTMLayer::run()
     _mm_recurrent_to_forget.run();
     _recurrent_to_forget_outstage.run();
-    NEScheduler::get().schedule(&_accumulate_input_recurrent_forget, Window::DimY);
+    _accumulate_input_recurrent_forget.run();
 
     if(_has_peephole)
     {
         NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY);
         _cell_to_forget_outstage.run();
-        NEScheduler::get().schedule(&_accumulate_cell_forget, Window::DimY);
+        _accumulate_cell_forget.run();
     }
 
     if(_has_layer_norm)
@@ -915,7 +915,7 @@ void NEQLSTMLayer::run()
     _mm_recurrent_to_cell.run();
     _recurrent_to_cell_outstage.run();
-    NEScheduler::get().schedule(&_accumulate_input_recurrent_modulation, Window::DimY);
+    _accumulate_input_recurrent_modulation.run();
 
     if(_has_layer_norm)
     {
@@ -927,7 +927,7 @@ void NEQLSTMLayer::run()
     // Input gate
     if(_has_cifg)
     {
-        NEScheduler::get().schedule(&_input_gate_sub, Window::DimY);
+        _input_gate_sub.run();
     }
     else
     {
@@ -935,13 +935,13 @@ void NEQLSTMLayer::run()
         _input_to_input_outstage.run();
         _mm_recurrent_to_input.run();
         _recurrent_to_input_outstage.run();
-        NEScheduler::get().schedule(&_accumulate_input_recurrent_input, Window::DimY);
+        _accumulate_input_recurrent_input.run();
 
         if(_has_peephole)
         {
            NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY);
            _cell_to_input_outstage.run();
-            NEScheduler::get().schedule(&_accumulate_cell_input, Window::DimY);
+            _accumulate_cell_input.run();
        }
 
        if(_has_layer_norm)
@@ -955,7 +955,8 @@ void NEQLSTMLayer::run()
     // Cell.
     NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY);
     NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY);
-    NEScheduler::get().schedule(&_add_forget_cell, Window::DimY);
+    _add_forget_cell.run();
+
     if(_has_cell_clipping)
     {
         _cell_clip.run();
@@ -966,12 +967,12 @@ void NEQLSTMLayer::run()
     _input_to_output_outstage.run();
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
-    NEScheduler::get().schedule(&_accumulate_input_recurrent_output, Window::DimY);
+    _accumulate_input_recurrent_output.run();
     if(_has_peephole)
     {
         NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY);
         _cell_to_output_outstage.run();
-        NEScheduler::get().schedule(&_accumulate_cell_to_output, Window::DimY);
+        _accumulate_cell_to_output.run();
     }
 
     if(_has_layer_norm)
@@ -997,7 +998,7 @@ void NEQLSTMLayer::run()
         _projection_output_to_accumulate_copy.run();
     }
 
-    NEScheduler::get().schedule(&_accumulate_projection, Window::DimY);
+    _accumulate_projection.run();
 
     if(_projection_tensor_copy_required)
     {
@@ -1077,7 +1078,7 @@ void NEQLSTMLayer::prepare()
         NEScheduler::get().schedule(&_projection_reduction, Window::DimY);
         if(_projection_bias != nullptr)
         {
-            NEScheduler::get().schedule(&_projection_bias_add, Window::DimY);
+            _projection_bias_add.run();
             _projection_bias->mark_as_unused();
         }
 
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 4a15777be9..19b84e7fb8 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -34,7 +34,7 @@ namespace arm_compute
 {
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
       _is_prepared(false)
 {
 }
@@ -59,7 +59,7 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
     auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
@@ -90,7 +90,7 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_f.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
@@ -111,7 +111,7 @@ void NERNNLayer::run()
     _gemm_state_f.run();
 
-    NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+    _add_f.run();
 
     _activation.run();
 
     // copy hidden out to output
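
Usage sketch (editor's illustration, not part of the patch): the change keeps the ITensor-based NEArithmeticAddition function interface unchanged while routing it through the new experimental::NEArithmeticAddition operator, which is configured from ITensorInfo only and receives its tensors at run time via InputTensorMap/OutputTensorMap. The snippet below shows how both paths might be driven; tensor shapes, data types, the explicit ActivationLayerInfo argument, the empty workspace map and the exact include paths are illustrative assumptions rather than taken from the patch.

// Minimal sketch, assuming the standard ComputeLibrary headers and a NEON build.
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative 16x16 F32 tensors (shapes and type are assumptions)
    Tensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    // Function interface: configure with tensors, run() dispatches the kernel internally
    NEArithmeticAddition add;
    add.configure(&a, &b, &out, ConvertPolicy::SATURATE);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    add.run();

    // Experimental operator interface: configure sees only tensor metadata,
    // the actual tensors are bound per run() call through tensor maps
    experimental::NEArithmeticAddition add_op;
    add_op.configure(a.info(), b.info(), out.info(), ConvertPolicy::SATURATE, ActivationLayerInfo());

    const InputTensorMap  src{ { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b } };
    const OutputTensorMap dst{ { TensorType::ACL_DST, &out } };
    add_op.run(src, dst, {}); // empty workspace, mirroring NEArithmeticAddition::run() in this patch

    return 0;
}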