From 4a578b923ed000c67fe0bc1433f945aea634ca9c Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 25 Jun 2021 12:13:49 +0100 Subject: Port the ClGemmLowp kernels to the new API Ported kernels: - CLGEMMLowpMatrixMultiplyNativeKernel - CLGEMMLowpMatrixMultiplyReshapedKernel - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLGEMMLowpOffsetContributionKernel - CLGEMMLowpOffsetContributionOutputStageKernel - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - CLGEMMLowpQuantizeDownInt32ScaleKernel Signed-off-by: Georgios Pinitas Change-Id: I9d5a744d6a2dd2f2726fdfb291bad000b6970de2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5870 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- src/runtime/CL/functions/CLQLSTMLayer.cpp | 100 ++++++++++++++++++------------ 1 file changed, 59 insertions(+), 41 deletions(-) (limited to 'src/runtime/CL/functions/CLQLSTMLayer.cpp') diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index fcf5b9d2a4..5df895a91c 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -31,17 +31,14 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "src/core/helpers/WindowHelpers.h" namespace arm_compute { using namespace 
arm_compute::utils::info_helpers; +using namespace arm_compute::opencl::kernels; namespace { Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, @@ -93,15 +90,15 @@ void CLQLSTMLayer::TensorCopyKernel::run() } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _input_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_input_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_forget_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_cell_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _input_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _recurrent_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), - _projection_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), _layer_norms(), _copy_output() { @@ -247,18 +244,22 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, 
_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); } - _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), 
_input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, + true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, + -qoutput_state_in.offset, true)); if(_has_projection) { - _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); if(_projection_bias != nullptr) { _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); @@ -677,19 +678,19 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); if(!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, + 
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + 
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); if(lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); if(lstm_params.projection_bias() != nullptr) @@ -1128,8 +1129,12 @@ void CLQLSTMLayer::prepare() { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_input_to_input_reduction); - CLScheduler::get().enqueue(*_recurrent_to_input_reduction); + + ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); + + ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); 
_input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1144,17 +1149,30 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_input_to_forget_reduction); - CLScheduler::get().enqueue(*_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(*_input_to_cell_reduction); - CLScheduler::get().enqueue(*_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(*_input_to_output_reduction); - CLScheduler::get().enqueue(*_recurrent_to_output_reduction); + + ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); + + ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); + + ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); + + ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } }; + CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); + + ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } }; + CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); + + ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } }; + 
CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); if(_has_projection) { _projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(*_projection_reduction); + ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } }; + CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); if(_projection_bias != nullptr) { _projection_bias_add.run(); -- cgit v1.2.1