diff options
Diffstat (limited to 'src/gpu')
-rw-r--r-- | src/gpu/cl/ClKernelLibrary.cpp | 12 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp | 23 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h | 3 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 22 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h | 4 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 15 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h | 3 | ||||
-rw-r--r-- | src/gpu/cl/operators/ClGemm.cpp | 8 | ||||
-rw-r--r-- | src/gpu/cl/operators/ClGemm.h | 1 |
9 files changed, 59 insertions, 32 deletions
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index c47cf8ef11..f87b226a64 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -290,10 +290,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" }, - { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" }, + { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, + { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, + { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" }, + { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" }, { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, @@ -590,6 +590,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = #include "./cl_kernels/common/gemm.clembed" }, { + "common/gemm_utils.cl", +#include "./cl_kernels/common/gemm_utils.clembed" + }, + { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl", #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed" }, diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp index af794354c3..05988997e7 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -275,6 +275,9 @@ 
void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. // NOTE: This might have implications on heuristics and performance const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + _m = internal_m; + _n = gemm_info.n; + _k = gemm_info.k; // Create build options CLBuildOptions build_opts; @@ -289,9 +292,6 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); @@ -312,6 +312,9 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile std::string kernel_name("gemm_mm_native"); post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -392,11 +395,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx0; if(_add_bias) { - idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (4 + _num_post_op_args); + idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 
+ _num_post_op_args); } else { - idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (3 + _num_post_op_args); + idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args); } const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); @@ -408,11 +411,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx0; if(_add_bias) { - idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args; + idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args; } else { - idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args; + idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 
1 : 0) + _num_post_op_args; } const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); @@ -455,6 +458,12 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2])); } + + // Pass m, n and k at runtime + _kernel.setArg<cl_int>(idx++, _m); + _kernel.setArg<cl_int>(idx++, _n); + _kernel.setArg<cl_int>(idx++, _k); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } while(window.slide_window_slice_3D(slice)); diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h index 415eb7bf3b..e478df727a 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -81,6 +81,9 @@ private: bool _reinterpret_output_as_3d{ false }; bool _use_dummy_work_items{ false }; bool _add_bias{ false }; + signed int _m{ 1 }; + signed int _n{ 1 }; + signed int _k{ 1 }; unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments }; } // namespace kernels diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index 64e99332fd..6a450b652b 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -201,7 +201,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; - _k 
= gemm_info.k; _num_post_op_args = gemm_info.post_ops.total_num_arguments(); // Check if we need to slide the matrix B @@ -230,6 +229,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi const unsigned int partial_store_m0 = internal_m % lhs_info.m0; const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + _m = gemm_info.m; + _n = gemm_info.n; + _k = gemm_info.k; // Create build options CLBuildOptions build_opts; @@ -250,9 +252,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); @@ -278,6 +277,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi kernel_name += rhs_info.export_to_cl_image ? 
"_texture" : ""; post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -399,9 +401,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind add_2D_tensor_argument(idx, post_op_arg, slice); } - // K dimension (not used if _export_to_cl_image == true) - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k)); - // LHS stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); @@ -429,6 +428,13 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad)); } + // Pass m, n and k at runtime + _kernel.setArg<cl_int>(idx++, _m); + _kernel.setArg<cl_int>(idx++, _n); + + // K dimension (not used if _export_to_cl_image == true) + _kernel.setArg<cl_int>(idx++, _k); + // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } @@ -436,4 +442,4 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index 09160ec0d1..2d668b91a3 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -105,7 +105,9 @@ private: bool _use_dummy_work_items{ false }; bool _add_bias{ false }; bool _export_to_cl_image{ false }; - unsigned int _k{ 1 }; + signed int _m{ 1 }; + signed int _n{ 1 }; + signed int _k{ 1 }; unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments }; } // namespace kernels diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index aa806978ef..29f9180bf4 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -240,7 +240,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. 
const unsigned int partial_store_m0 = internal_m % internal_m0; const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - + _m = internal_m; + _n = gemm_info.n; + _k = gemm_info.k; // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); @@ -253,9 +255,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); @@ -286,6 +285,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -447,6 +449,11 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out)); } + // Pass m, n and k at runtime as signed ints, to ensure results of any subtractions they could be operands in, would still be signed.
+ _kernel.setArg<cl_int>(idx++, _m); + _kernel.setArg<cl_int>(idx++, _n); + _kernel.setArg<cl_int>(idx++, _k); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } while(window.slide_window_slice_3D(slice)); diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index a8f0c4c3a0..ec5878d5cc 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -97,6 +97,9 @@ private: bool _add_bias{ false }; bool _export_to_cl_image{ false }; bool _has_pad_y{ false }; + signed int _m{ 1 }; + signed int _n{ 1 }; + signed int _k{ 1 }; unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments }; } // namespace kernels diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 50ecb214e3..555738531a 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -191,7 +191,6 @@ ClGemm::ClGemm() _mm_native_kernel(std::make_unique<ClGemmMatrixMultiplyNativeKernel>()), _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()), _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), - _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), _tmp_a(), _tmp_b(), _reshape_b_only_on_first_run(false), @@ -303,7 +302,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Set the target for the kernels _mm_reshaped_only_rhs_kernel->set_target(gpu_target); - _mm_reshaped_only_rhs_fallback_kernel->set_target(gpu_target); GEMMLHSMatrixInfo lhs_info{}; GEMMRHSMatrixInfo rhs_info{}; @@ -322,10 +320,6 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context kernel_info.has_pad_y = false; _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, 
output, alpha, beta, lhs_info, rhs_info, kernel_info); - // Configure matrix multiply kernel with y padding support - kernel_info.has_pad_y = true; - _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - // Request memory for RHS reshape matrix _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } @@ -625,7 +619,7 @@ void ClGemm::run(ITensorPack &tensors) if(has_pad_y) { - CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true); + ARM_COMPUTE_ERROR_ON(has_pad_y); } else { diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h index e084e53fe4..3c0cad3ca4 100644 --- a/src/gpu/cl/operators/ClGemm.h +++ b/src/gpu/cl/operators/ClGemm.h @@ -121,7 +121,6 @@ private: std::unique_ptr<kernels::ClGemmMatrixMultiplyNativeKernel> _mm_native_kernel; std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel; std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; - std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel; TensorInfo _tmp_a; TensorInfo _tmp_b; bool _reshape_b_only_on_first_run; |