From d1f54767fc9d6398a5eea38e639dd0ce3df8e5d8 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 19 Jul 2019 09:54:47 +0100 Subject: COMPMID-1979: Fuse Activation Function in CLGEMM - part 3 Fused beta*bias in in the old cl gemm kernels Fused activation function in the old cl gemm kernels Change-Id: I695fb9189e6d4792010abd256784624982d17d79 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/1587 Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 133 ++++++++++++++------- 1 file changed, 88 insertions(+), 45 deletions(-) (limited to 'src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp') diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index b3ea309c93..e793c65059 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -62,27 +62,34 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D"); - const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; - const bool has_vec_c = input2 != nullptr && beta != 0.f; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(has_vec_c && !is_beta_one, "Adding input2 is only supported for beta equal to 1"); - if(!is_interleaved_transposed) { ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1)); - if(has_vec_c) + if(input2 != nullptr && !(helpers::float_ops::is_zero(beta))) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->dimension(0) != input1->dimension(0), "Length of Vector C must match the number of columns of matrix B"); + const unsigned int m = reshape_info.reinterpret_input_as_3d() ? input0->dimension(1) * input0->dimension(2) : input0->dimension(1); + const unsigned int n = input1->dimension(0); + const unsigned int input2_dim0 = input2->dimension(0); + const unsigned int input2_dim1 = input2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1); + if(reshape_info.broadcast_bias()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix"); + } } } else { GEMMRHSMatrixInfo rhs_info; GEMMLHSMatrixInfo lhs_info; - const int m = reshape_info.m(); - const int n = reshape_info.n(); + const auto m = static_cast(reshape_info.m()); + const auto n = static_cast(reshape_info.n()); const int k = reshape_info.k(); const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); @@ -114,10 +121,20 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); - if(has_vec_c) + if(input2 != nullptr && !(helpers::float_ops::is_zero(beta))) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor"); + const unsigned int input2_dim0 = input2->dimension(0); + const unsigned int input2_dim1 = input2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1); + if(reshape_info.broadcast_bias()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix"); + } } } @@ -145,7 +162,6 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); - const bool has_vec_c = input2 != nullptr && beta != 0.f; // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. @@ -194,12 +210,23 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), output->dimension(1) + bottom_pad); - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - if(has_vec_c) + if(input2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y; + + AccessWindowStatic input2_access(input2, 0, 0, + ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x), + ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y)); + + window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor + } + else { - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win, input2_access); + window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor } output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape())); @@ -232,12 +259,23 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), output->dimension(1) + bottom_pad); - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - if(has_vec_c) + if(input2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y; + + AccessWindowStatic input2_access(input2, 0, 0, + ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x), + ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y)); + + window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor + } + else { - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win, input2_access); + window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor } Coordinates coord; @@ -257,12 +295,13 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu } // namespace CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _has_vec_c(false) + : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false), + _broadcast_bias(false) { } void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision) + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); @@ -272,10 +311,12 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _input0 = input0; _input1 = input1; - _input2 = input2; + _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2; _output = output; _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); + _add_bias = _input2 != nullptr; + _broadcast_bias = reshape_info.broadcast_bias(); // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. @@ -306,23 +347,21 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen // Create build options CLBuildOptions build_opts; - // Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications. - if(!(helpers::float_ops::is_one(alpha))) - { - build_opts.add_option("-DALPHA=" + float_to_string_with_full_precision(alpha)); - } + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - - // Do not slide matrix B if _slide_matrix_b = false build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); + build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation()))); + build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a())); + build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b())); const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST; - _has_vec_c = input2 != nullptr && beta != 0.f; - std::string kernel_name; if(is_interleaved_transposed) { @@ -386,15 +425,14 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x())); } - // Configure matrix C addition if necessary - build_opts.add_option_if(_has_vec_c, "-DADD_VEC_C"); - // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); // Set config_id for enabling LWS tuning _config_id = "gemm_"; _config_id += (is_interleaved_transposed ? "reshaped_" : ""); + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (_broadcast_bias ? "broadcast_bias_" : ""); _config_id += (fp_mixed_precision ? "fp_mixed_" : ""); _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); @@ -412,11 +450,12 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen } Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision) + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) { // Note: num_elements_processed will be set in validate_and_configure_window() ElementsProcessed num_elements_processed{}; ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(activation_info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), @@ -449,12 +488,12 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - const unsigned int num_arguments_vec_c = (_has_vec_c) ? num_arguments_per_1D_tensor() : 0; + const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0; if(_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_vec_c; + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias; const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } @@ -462,7 +501,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que if(_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_vec_c; + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias; const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); } @@ -480,13 +519,17 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que unsigned int idx = 0; add_2D_tensor_argument(idx, _input0, slice); add_2D_tensor_argument(idx, _input1, slice_b); - if(_has_vec_c) + if(_add_bias) { - add_1D_tensor_argument(idx, _input2, slice); + add_2D_tensor_argument(idx, _input2, slice); } add_2D_tensor_argument(idx, _output, slice); _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); + if(_add_bias) + { + _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[2])); + } _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint()); } -- cgit v1.2.1