diff options
Diffstat (limited to 'src/core/CL/cl_kernels/common/mat_mul.cl')
-rw-r--r-- | src/core/CL/cl_kernels/common/mat_mul.cl | 247 |
1 files changed, 159 insertions, 88 deletions
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl index 9656a59728..c7ef8ae52b 100644 --- a/src/core/CL/cl_kernels/common/mat_mul.cl +++ b/src/core/CL/cl_kernels/common/mat_mul.cl @@ -25,6 +25,21 @@ #include "helpers.h" #include "tile_helpers.h" +#ifdef BIAS +// This function performs in-place bias addition for float/half datatype when bias is enabled. +// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4). +inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x) +{ + TILE(DATA_TYPE, 1, N0, bias_tile); + + // below expands to use bias_ptr and bias_offset_first_element_in_bytes + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile); + + // c = c + bias[broadcasted] + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc); +} +#endif // defined(BIAS) + #if defined(MAT_MUL_NATIVE_NT_NT) /** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only * @@ -43,32 +58,42 @@ * - K0 = 1, 2, 3, 4, 8, 16 * @note Values > 8 for M0 are not expected to be efficient * - * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 - * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) - * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) - * @param[in] lhs_w The width of the lhs tensor - * @param[in] lhs_h The height of the lhs tensor - * @param[in] lhs_n Number of the matrices (buffers) in the batch - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix - * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE - * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr - * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) - * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) - * @param[in] rhs_w The width of the rhs tensor - * @param[in] rhs_h The height of the rhs tensor - * @param[in] rhs_n Number of the matrices (buffers) in the batch - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix - * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr - * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) - * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) - * @param[in] dst_w The width of the dst tensor - * @param[in] dst_h The height of the dst tensor - * @param[in] dst_n Number of the matrices (buffers) in the batch - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix + * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 + * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) + * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) + * @param[in] lhs_w The width of the lhs tensor + * @param[in] lhs_h The height of the lhs tensor + * @param[in] lhs_n Number of the matrices (buffers) in the batch + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix + * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE + * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr + * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) + * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) + * @param[in] rhs_w The width of the rhs tensor + * @param[in] rhs_h The height of the rhs tensor + * @param[in] rhs_n Number of the matrices (buffers) in the batch + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix + * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor + * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor + * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr + * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) + * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) + * @param[in] dst_w The width of the dst tensor + * @param[in] dst_h The height of the dst tensor + * @param[in] dst_n Number of the matrices (buffers) in the batch + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix */ __kernel void mat_mul_native_nt_nt( TENSOR3D_T(lhs, BUFFER), TENSOR3D_T(rhs, RHS_TENSOR_TYPE), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) TENSOR3D_T(dst, BUFFER)) { const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0); @@ -149,6 +174,10 @@ __kernel void mat_mul_native_nt_nt( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x); +#endif // defined(BIAS) + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); @@ -173,31 +202,41 @@ __kernel void mat_mul_native_nt_nt( * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE) * @note Values > 8 for M0, N0 and K0 are not expected to be efficient * - * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 - * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) - * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) - * @param[in] lhs_w The width of the lhs tensor - * @param[in] lhs_h The height of the lhs tensor - * @param[in] lhs_n Number of the matrices (buffers) in the batch - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix - * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE - * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr - * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) - * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) - * @param[in] rhs_w The width of the rhs tensor - * @param[in] rhs_h The height of the rhs tensor - * @param[in] rhs_n Number of the matrices (buffers) in the batch - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix - * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr - * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) - * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) - * @param[in] dst_w The width of the dst tensor - * @param[in] dst_h The height of the dst tensor - * @param[in] dst_n Number of the matrices (buffers) in the batch - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix + * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 + * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) + * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) + * @param[in] lhs_w The width of the lhs tensor + * @param[in] lhs_h The height of the lhs tensor + * @param[in] lhs_n Number of the matrices (buffers) in the batch + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix + * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE + * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr + * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) + * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) + * @param[in] rhs_w The width of the rhs tensor + * @param[in] rhs_h The height of the rhs tensor + * @param[in] rhs_n Number of the matrices (buffers) in the batch + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix + * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor + * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor + * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr + * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) + * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) + * @param[in] dst_w The width of the dst tensor + * @param[in] dst_h The height of the dst tensor + * @param[in] dst_n Number of the matrices (buffers) in the batch + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix */ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), TENSOR3D_T(rhs, RHS_TENSOR_TYPE), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) TENSOR3D_T(dst, BUFFER)) { @@ -306,6 +345,10 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x); +#endif // defined(BIAS) + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); @@ -330,32 +373,42 @@ __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER), * - K0 > 0 * * @note Values > 8 for M0, and K0 are not expected to be efficient * - * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 - * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) - * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) - * @param[in] lhs_w The width of the lhs tensor - * @param[in] lhs_h The height of the lhs tensor - * @param[in] lhs_n Number of the matrices (buffers) in the batch - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix - * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE - * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr - * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) - * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) - * @param[in] rhs_w The width of the rhs tensor - * @param[in] rhs_h The height of the rhs tensor - * @param[in] rhs_n Number of the matrices (buffers) in the batch - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix - * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr - * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) - * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) - * @param[in] dst_w The width of the dst tensor - * @param[in] dst_h The height of the dst tensor - * @param[in] dst_n Number of the matrices (buffers) in the batch - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix + * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 + * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) + * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) + * @param[in] lhs_w The width of the lhs tensor + * @param[in] lhs_h The height of the lhs tensor + * @param[in] lhs_n Number of the matrices (buffers) in the batch + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix + * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE + * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr + * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) + * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) + * @param[in] rhs_w The width of the rhs tensor + * @param[in] rhs_h The height of the rhs tensor + * @param[in] rhs_n Number of the matrices (buffers) in the batch + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix + * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor + * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor + * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr + * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) + * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) + * @param[in] dst_w The width of the dst tensor + * @param[in] dst_h The height of the dst tensor + * @param[in] dst_n Number of the matrices (buffers) in the batch + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix */ __kernel void mat_mul_native_t_nt( TENSOR3D_T(lhs, BUFFER), TENSOR3D_T(rhs, RHS_TENSOR_TYPE), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) TENSOR3D_T(dst, BUFFER)) { const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0); @@ -459,6 +512,10 @@ __kernel void mat_mul_native_t_nt( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x); +#endif // defined(BIAS) + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); @@ -483,32 +540,42 @@ __kernel void mat_mul_native_t_nt( * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE) * @note Values > 8 for M0, N0 and K0 are not expected to be efficient * - * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 - * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) - * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) - * @param[in] lhs_w The width of the lhs tensor - * @param[in] lhs_h The height of the lhs tensor - * @param[in] lhs_n Number of the matrices (buffers) in the batch - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix - * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE - * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr - * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) - * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) - * @param[in] rhs_w The width of the rhs tensor - * @param[in] rhs_h The height of the rhs tensor - * @param[in] rhs_n Number of the matrices (buffers) in the batch - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix - * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr - * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) - * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) - * @param[in] dst_w The width of the dst tensor - * @param[in] dst_h The height of the dst tensor - * @param[in] dst_n Number of the matrices (buffers) in the batch - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix + * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16 + * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes) + * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes) + * @param[in] lhs_w The width of the lhs tensor + * @param[in] lhs_h The height of the lhs tensor + * @param[in] lhs_n Number of the matrices (buffers) in the batch + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix + * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE + * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr + * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes) + * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes) + * @param[in] rhs_w The width of the rhs tensor + * @param[in] rhs_h The height of the rhs tensor + * @param[in] rhs_n Number of the matrices (buffers) in the batch + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix + * @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] bias_w (Optional) The size of the width dimension of the bias tensor + * @param[in] bias_h (Optional) The size of the height dimension of the bias tensor + * @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr, + * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes) + * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes) + * @param[in] dst_w The width of the dst tensor + * @param[in] dst_h The height of the dst tensor + * @param[in] dst_n Number of the matrices (buffers) in the batch + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix */ __kernel void mat_mul_native_t_t( TENSOR3D_T(lhs, BUFFER), TENSOR3D_T(rhs, RHS_TENSOR_TYPE), +#ifdef BIAS + TENSOR3D_T(bias, BUFFER), +#endif // defined(BIAS) TENSOR3D_T(dst, BUFFER)) { const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0); @@ -630,6 +697,10 @@ __kernel void mat_mul_native_t_t( indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond)); }); +#ifdef BIAS + perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x); +#endif // defined(BIAS) + T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc); T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer); |