From 9ae06d4986bc3055f7786c1097b465bd321cf8eb Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 22 Oct 2020 16:37:12 +0100
Subject: COMPMID-3925: Dispatch CLGEMM with no padding y requirement

- Add has_pad_y flag in GEMMKernelInfo
- Skip reinterpret as 3D in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
  if has_pad_y = false
- Add test to validate CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with
  has_pad_y = false/true
- Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
  to run with has_pad_y = false/true in CLGEMM
- Check if the lhs/dst tensors have pad y. If not, run
  CLGEMMMatrixMultiplyReshapedOnlyRHSKernel without padding requirement

Change-Id: I68bb43389789736d676b899ac7c77fd9138babaf
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4248
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp | 66 +++++++++++-----------
 1 file changed, 34 insertions(+), 32 deletions(-)

(limited to 'src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp')

diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 912c763ed5..68f761b9e7 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -133,7 +133,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    // This approach should only be used when the input/output tensors have pad on the y direction
+    if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
     {
         reinterpret_output_as_3d = false;
     }
 
@@ -159,16 +160,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    AccessWindowStatic input0_access(input0, 0, 0,
-                                     input0->dimension(0),
-                                     input0->dimension(1));
-    AccessWindowStatic input1_access(input1, 0, 0,
-                                     input1->dimension(0),
-                                     input1->dimension(1));
-    AccessWindowStatic output_access(output, 0, 0,
-                                     output->dimension(0),
-                                     output->dimension(1));
-
     if(input2 != nullptr)
     {
         const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
@@ -177,17 +168,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
                                          ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
                                          input2->dimension(1));
 
-        window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
+        window_changed = update_window_and_padding(win, input2_access);
     }
 
-    output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
     Window collapsed = win;
@@ -201,7 +184,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 
 CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false)
+      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false), _has_pad_y(false)
 {
 }
 
@@ -232,10 +215,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = gemm_info.broadcast_bias;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
+    _has_pad_y                = gemm_info.has_pad_y;
+
+    auto padding_info = get_padding_info({ input0, input1, output });
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
     {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
@@ -257,6 +243,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
     const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
 
+    // These variables are used only if gemm_info.has_pad_y == true
    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
     const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
 
@@ -274,11 +261,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
     build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
@@ -296,6 +279,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+    if(_has_pad_y)
+    {
+        build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+        build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+    }
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
@@ -307,6 +297,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
     _config_id += "_";
+    _config_id += (_has_pad_y ? "" : "no_pad_y_");
     _config_id += (_add_bias ? "add_bias_" : "");
     _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
     _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
@@ -331,6 +322,8 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     _config_id += support::cpp11::to_string(rhs_info.h0);
     _config_id += "_";
     _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
@@ -363,15 +356,24 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co
         ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
     }
 
+    const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
+    const size_t rhs_idx_batch_size = 2u;
+    const size_t bia_idx_batch_size = 2u;
+    const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
+
     Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
 
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
+    // Get cross plane pads
     const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom;
     const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom;
 
+    // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
+    ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
+
     cl::Image2D input1_image2d;
 
     if(_export_to_cl_image)
@@ -414,28 +416,28 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co
         add_2D_tensor_argument(idx, _output, slice);
 
         // LHS stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input0->info()->strides_in_bytes()[lhs_idx_batch_size]));
 
         // RHS stride_z (not used if _export_to_cl_image == true)
-        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input1->info()->strides_in_bytes()[rhs_idx_batch_size]));
 
         // Bias stride_z (if _add_bias == true)
         if(_add_bias)
         {
-            _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input2->info()->strides_in_bytes()[2]));
+            _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input2->info()->strides_in_bytes()[bia_idx_batch_size]));
         }
 
         // Output stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_output->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_output->info()->strides_in_bytes()[out_idx_batch_size]));
 
         // Cross-plan padding (if _reinterpret_input_as_3d = true)
-        if(_reinterpret_input_as_3d)
+        if(_reinterpret_input_as_3d && _has_pad_y)
        {
             _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(total_cross_plane_pad_lhs));
         }
 
         // Cross-plan padding (if _reinterpret_output_as_3d = true)
-        if(_reinterpret_output_as_3d)
+        if(_reinterpret_output_as_3d && _has_pad_y)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(total_cross_plane_pad_out));
         }
--
cgit v1.2.1
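
Editor's note: below is a minimal sketch, not part of the patch, of the dispatch scheme the commit message describes: CLGEMM configures two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel, one with has_pad_y = true and one with has_pad_y = false, and chooses between them at run time depending on whether the LHS/DST tensors actually carry pad on the y direction. All types and signatures here are simplified stand-ins for illustration, not the real Compute Library API.

```cpp
#include <cassert>
#include <cstddef>

struct Padding
{
    std::size_t top{ 0 }, bottom{ 0 };
};

struct TensorInfo
{
    Padding padding{};
    // Mirrors the patch's check: total cross-plane pad on the y direction
    bool has_pad_y() const { return (padding.top + padding.bottom) != 0; }
};

struct GEMMKernelInfo
{
    bool has_pad_y{ false }; // flag introduced by this patch
};

// Stand-in for CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
class MatMulKernel
{
public:
    void configure(const GEMMKernelInfo &info)
    {
        // In the real kernel this choice is baked into the OpenCL build
        // options (REINTERPRET_*_AS_3D, HEIGHT/DEPTH_GEMM3D) and config_id
        _has_pad_y = info.has_pad_y;
    }

    void run(const TensorInfo &lhs, const TensorInfo &dst) const
    {
        // Counterpart of the patch's ARM_COMPUTE_ERROR_ON: running the
        // no-padding variant with padded tensors is a programming error
        assert(_has_pad_y || (!lhs.has_pad_y() && !dst.has_pad_y()));
        (void)lhs;
        (void)dst; // ... enqueue the OpenCL kernel here ...
    }

private:
    bool _has_pad_y{ false };
};

// Stand-in for the CLGEMM function: configure both variants up front,
// then dispatch on the actual tensor padding at run time
class Gemm
{
public:
    void configure()
    {
        GEMMKernelInfo info;
        info.has_pad_y = false;
        _kernel_no_pad.configure(info); // padding-free fast path
        info.has_pad_y = true;
        _kernel_pad.configure(info); // variant that tolerates y padding
    }

    void run(const TensorInfo &lhs, const TensorInfo &dst) const
    {
        const bool has_pad_y = lhs.has_pad_y() || dst.has_pad_y();
        (has_pad_y ? _kernel_pad : _kernel_no_pad).run(lhs, dst);
    }

private:
    MatMulKernel _kernel_pad{};
    MatMulKernel _kernel_no_pad{};
};
```

The runtime selection matters because a tensor's padding is not fixed at configure time: another function sharing the same tensor may extend its padding later, so the padded fallback has to stay available even when the fast path was expected.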