aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels
diff options
context:
space:
mode:
authorMohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>2023-06-27 14:25:58 +0100
committerMohmun02 <MohammedSuhail.Munshi@arm.com>2023-07-11 08:53:19 +0000
commit8e2dedea8550b1c18c3bbeead8c972f661dcfac8 (patch)
tree61cd0326b9690e343d62a5c72d935fcd68017eb9 /src/core/CL/cl_kernels
parent5ff480265a110ea1f2ce24491e082f52348b0f92 (diff)
downloadComputeLibrary-8e2dedea8550b1c18c3bbeead8c972f661dcfac8.tar.gz
Add Bias to MatMul Kernels and add support for use in Fully Connected Layer
Resolves: [COMPMID-6316] Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com> Change-Id: I08e6bac9e6b46b76978da0dc6a48ccfe3dde5086 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9833 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--src/core/CL/cl_kernels/common/mat_mul.cl247
-rw-r--r--src/core/CL/cl_kernels/common/mat_mul_mmul.cl271
-rw-r--r--src/core/CL/cl_kernels/common/mat_mul_quantized.cl239
3 files changed, 485 insertions, 272 deletions
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
index 9656a59728..c7ef8ae52b 100644
--- a/src/core/CL/cl_kernels/common/mat_mul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -25,6 +25,21 @@
#include "helpers.h"
#include "tile_helpers.h"
+#ifdef BIAS
+// This function performs in-place bias addition for float/half datatype when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+ TILE(DATA_TYPE, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
#if defined(MAT_MUL_NATIVE_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
*
@@ -43,32 +58,42 @@
* - K0 = 1, 2, 3, 4, 8, 16
* @note Values > 8 for M0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_nt_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -149,6 +174,10 @@ __kernel void mat_mul_native_nt_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
@@ -173,31 +202,41 @@ __kernel void mat_mul_native_nt_nt(
* - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
* @note Values > 8 for M0, N0 and K0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
@@ -306,6 +345,10 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
@@ -330,32 +373,42 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
* - K0 > 0
* * @note Values > 8 for M0, and K0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_t_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -459,6 +512,10 @@ __kernel void mat_mul_native_t_nt(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
@@ -483,32 +540,42 @@ __kernel void mat_mul_native_t_nt(
* - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
* @note Values > 8 for M0, N0 and K0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr,
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_t_t(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -630,6 +697,10 @@ __kernel void mat_mul_native_t_t(
indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
});
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
diff --git a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
index a53db27fb8..e549da86d4 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
@@ -24,6 +24,21 @@
#include "helpers.h"
#include "tile_helpers.h"
+#ifdef BIAS
+// This function performs in-place bias addition for float and half datatypes when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+ TILE(DATA_TYPE, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
#if defined(MAT_MUL_NATIVE_MMUL_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS non-transposed - buffer only
*
@@ -40,34 +55,44 @@
* - K0 = 1
* @note Values > 8 for M0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
- * @param[in] M Number of rows in LHS matrix
- * @param[in] N Number of columns in RHS matrix
- * @param[in] K Number of columns in LHS matrix and rows in RHS matrix, which is multiple of MMUL_K0.
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix, which is multiple of MMUL_K0.
*/
__kernel void mat_mul_native_mmul_nt_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER),
const int M,
const int N,
@@ -90,7 +115,7 @@ __kernel void mat_mul_native_mmul_nt_nt(
// x = [0, ((N / N0) / MMUL_N0) * MMUL_N0 * MMUL_M0)
// x = [0, (N / N0) * MMUL_MO)
const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
- // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+ // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
const uint z = get_global_id(2); // Batch
@@ -347,6 +372,10 @@ __kernel void mat_mul_native_mmul_nt_nt(
#define c c_f32
#endif // defined(HALF_PRECISION)
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
if(dst_x + N0 <= N || N0_LEFTOVER == 0)
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
@@ -391,34 +420,44 @@ __kernel void mat_mul_native_mmul_nt_nt(
* - K0 = 1
* @note Values > 8 for M0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
- * @param[in] M Number of rows in DST matrix
- * @param[in] N Number of columns in DST matrix
- * @param[in] K Number of rows in LHS and RHS matrices, which is multiple of MMUL_K0.
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in DST matrix
+ * @param[in] N Number of columns in DST matrix
+ * @param[in] K Number of rows in LHS and RHS matrices, which is multiple of MMUL_K0.
*/
__kernel void mat_mul_native_mmul_t_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER),
const int M,
const int N,
@@ -428,7 +467,7 @@ __kernel void mat_mul_native_mmul_t_nt(
// For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.
const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
- // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+ // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
const uint z = get_global_id(2); // Batch
@@ -511,6 +550,10 @@ __kernel void mat_mul_native_mmul_t_nt(
#define c c_f32
#endif // defined(HALF_PRECISION)
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
if(dst_x + N0 <= N || N0_LEFTOVER == 0)
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
@@ -554,34 +597,44 @@ __kernel void mat_mul_native_mmul_t_nt(
* - K0 = 1
* @note Values > 8 for M0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
- * @param[in] M Number of rows in LHS matrix
- * @param[in] N Number of columns in RHS matrix
- * @param[in] K Number of columns in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of columns in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.
*/
__kernel void mat_mul_native_mmul_nt_t(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER),
const int M,
const int N,
@@ -591,7 +644,7 @@ __kernel void mat_mul_native_mmul_nt_t(
// For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.
const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
- // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+ // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
const uint z = get_global_id(2); // Batch
@@ -679,6 +732,10 @@ __kernel void mat_mul_native_mmul_nt_t(
#define c c_f32
#endif // defined(HALF_PRECISION)
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
if(dst_x + N0 <= N || N0_LEFTOVER == 0)
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
@@ -722,34 +779,44 @@ __kernel void mat_mul_native_mmul_nt_t(
* - K0 = 1
* @note Values > 8 for M0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
- * @param[in] M Number of rows in LHS matrix
- * @param[in] N Number of columns in RHS matrix
- * @param[in] K Number of rows in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] M Number of rows in LHS matrix
+ * @param[in] N Number of columns in RHS matrix
+ * @param[in] K Number of rows in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.
*/
__kernel void mat_mul_native_mmul_t_t(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER),
const int M,
const int N,
@@ -759,7 +826,7 @@ __kernel void mat_mul_native_mmul_t_t(
// For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.
const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
- // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+ // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
const uint z = get_global_id(2); // Batch
@@ -847,6 +914,10 @@ __kernel void mat_mul_native_mmul_t_t(
#define c c_f32
#endif // defined(HALF_PRECISION)
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
if(dst_x + N0 <= N || N0_LEFTOVER == 0)
{
LOOP_UNROLLING(int, m0, 0, 1, M0,
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
index 7029af2188..7f81ac4549 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -25,6 +25,21 @@
#include "helpers.h"
#include "tile_helpers.h"
+#ifdef BIAS
+// This function performs in-place bias addition for integer datatype when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+ TILE(int, 1, N0, bias_tile);
+
+ // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+ T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
*
@@ -43,31 +58,41 @@
* - K0 = 1, 2, 3, 4, 8, 16
* @note Values > 8 for M0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_quantized_nt_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -197,6 +222,10 @@ __kernel void mat_mul_native_quantized_nt_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -231,31 +260,41 @@ __kernel void mat_mul_native_quantized_nt_nt(
* - K0 = 1, 2, 3, 4, 8, 16
* @note Values > 8 for M0, N0, K0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_quantized_nt_t(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -377,6 +416,10 @@ __kernel void mat_mul_native_quantized_nt_t(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -411,31 +454,41 @@ __kernel void mat_mul_native_quantized_nt_t(
* - K0 = 1, 2, 3, 4, 8, 16
* @note Values > 8 for M0, N0 and K0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_quantized_t_nt(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -559,6 +612,10 @@ __kernel void mat_mul_native_quantized_t_nt(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
@@ -593,31 +650,41 @@ __kernel void mat_mul_native_quantized_t_nt(
* - K0 = 1, 2, 3, 4, 8, 16
* @note Values > 8 for M0, N0 and K0 are not expected to be efficient
*
- * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] lhs_w The width of the lhs tensor
- * @param[in] lhs_h The height of the lhs tensor
- * @param[in] lhs_n Number of the matrices (buffers) in the batch
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in] rhs_w The width of the rhs tensor
- * @param[in] rhs_h The height of the rhs tensor
- * @param[in] rhs_n Number of the matrices (buffers) in the batch
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in] dst_w The width of the dst tensor
- * @param[in] dst_h The height of the dst tensor
- * @param[in] dst_n Number of the matrices (buffers) in the batch
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] lhs_w The width of the lhs tensor
+ * @param[in] lhs_h The height of the lhs tensor
+ * @param[in] lhs_n Number of the matrices (buffers) in the batch
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
+ * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in] rhs_w The width of the rhs tensor
+ * @param[in] rhs_h The height of the rhs tensor
+ * @param[in] rhs_n Number of the matrices (buffers) in the batch
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in] dst_w The width of the dst tensor
+ * @param[in] dst_h The height of the dst tensor
+ * @param[in] dst_n Number of the matrices (buffers) in the batch
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
*/
__kernel void mat_mul_native_quantized_t_t(
TENSOR3D_T(lhs, BUFFER),
TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+ TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
TENSOR3D_T(dst, BUFFER))
{
const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -745,6 +812,10 @@ __kernel void mat_mul_native_quantized_t_t(
const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+#ifdef BIAS
+ perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
// Quantize the tile
TILE(DATA_TYPE, M0, N0, accq);
T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);