aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
diff options
context:
space:
mode:
authorramelg01 <ramy.elgammal@arm.com>2021-11-11 10:05:00 +0000
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-11-20 17:38:07 +0000
commit9cca592c13f1e688a35698641069bcd37a525f0c (patch)
tree8f69b654c5f543d918ec5d61140af30bbadbd390 /src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
parente330fb41d85d7058f74902ce1d47b2dc00b10a52 (diff)
downloadComputeLibrary-9cca592c13f1e688a35698641069bcd37a525f0c.tar.gz
Improve start-up timer for GeMM (floating-point):
- Pass M,N,K at runtime as kernel parameters - Add a guard macro to compile only kernel of interest - Move reshpaing kernels to gemm_utils.cl - Remove the fallback reshaping kernel with Y-Padding support Resolves: COMPMID-4888 Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com> Change-Id: Ida3851326f0b77e410633271de9ecca106e37931 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6662 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp')
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp22
1 files changed, 14 insertions, 8 deletions
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index 64e99332fd..6a450b652b 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -201,7 +201,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
- _k = gemm_info.k;
_num_post_op_args = gemm_info.post_ops.total_num_arguments();
// Check if we need to slide the matrix B
@@ -230,6 +229,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+ _m = gemm_info.m;
+ _n = gemm_info.n;
+ _k = gemm_info.k;
// Create build options
CLBuildOptions build_opts;
@@ -250,9 +252,6 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
- build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
- build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -278,6 +277,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
+
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -399,9 +401,6 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
add_2D_tensor_argument(idx, post_op_arg, slice);
}
- // K dimension (not used if _export_to_cl_image == true)
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
@@ -429,6 +428,13 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
}
+ // Pass m, n and k at runtime
+ _kernel.setArg<cl_int>(idx++, _m);
+ _kernel.setArg<cl_int>(idx++, _n);
+
+ // K dimension (not used if _export_to_cl_image == true)
+ _kernel.setArg<cl_int>(idx++, _k);
+
// Dispatch kernel
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
@@ -436,4 +442,4 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute