diff options
author | Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com> | 2023-05-25 16:48:43 +0100 |
---|---|---|
committer | Mohmun02 <MohammedSuhail.Munshi@arm.com> | 2023-06-16 15:38:39 +0000 |
commit | 94abde4f4e98f6f1adb5c46b194527f34a8ea07d (patch) | |
tree | d6d717031788850d970fb44ff3f41de311cc5fc0 /src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp | |
parent | dd8d7f4102653ef55d872c71ae5d5f2ca2ead0c1 (diff) | |
download | ComputeLibrary-94abde4f4e98f6f1adb5c46b194527f34a8ea07d.tar.gz |
Add Fused Activation to OpenCL MatMul
- Added fused activation to MatMul function interface
- Added fused activation to CL backend
- Includes tests for supported Activation Functions in MatMul
Resolves: [COMPMID-6192]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ie103212b600b60699eaf6a6394d609e6e1f5aba6
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/522465
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9714
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp')
-rw-r--r-- | src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp | 50 |
1 files changed, 28 insertions, 22 deletions
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index d5ecdf7dd2..9bbec908a3 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -98,34 +98,36 @@ ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - if(output->total_size() != 0) + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info); - ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info)); // output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); - const int m = output->dimension(1); - const int n = output->dimension(0); + const int m = dst->dimension(1); + const int n = dst->dimension(0); const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); const bool adj_lhs = matmul_kernel_info.adj_lhs; @@ -133,7 +135,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context int n0 = adjust_vec_size(matmul_kernel_info.n0, n); // Configure kernel window - Window win = calculate_max_window(*output, Steps(n0, m0)); + Window win = calculate_max_window(*dst, Steps(n0, m0)); win = win.collapse(win, Window::DimZ); IClKernel::configure_internal(win); @@ -152,7 +154,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform(); const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform(); - const UniformQuantizationInfo dqinfo = output->quantization_info().uniform(); + const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform(); float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale; int output_multiplier = 0; @@ -166,6 +168,10 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(-rqinfo.offset)); // Note this is passed as negative to maintain similarity with CLDirectConv2D build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset)); // Passed as positive (unlike the above two) + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()))); + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + std::string kernel_name("mat_mul_native_quantized"); kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt"; @@ -177,7 +183,7 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set config_id for enabling LWS tuning - const size_t number_of_batches = output->tensor_shape().total_size() / (m * n); + const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n); _config_id = kernel_name; _config_id += "_"; @@ -203,18 +209,18 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output); + const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); unsigned int idx = 0; Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ); add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - add_3d_tensor_nhw_argument(idx, output); + add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, window_collapsed, lws_hint()); } |