aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/gemm.cl
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-07-11 15:54:56 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:10 +0000
commite8bd2c729546e59aa0adc241976ea91fc6f25b52 (patch)
treededa6658ad22ee6f68f96f6221caed5aa80acd10 /src/core/CL/cl_kernels/gemm.cl
parented32f43174ce45cafe9d93e1a0b92cbebbcb7493 (diff)
downloadComputeLibrary-e8bd2c729546e59aa0adc241976ea91fc6f25b52.tar.gz
COMPMID-1384: graph_mobilenet fails for NHWC on OpenCL
Makes GEMM3D account top padding when jumping among planes. Change-Id: Ia7c16cfa5498de106774ce42cbc4716e9f43195b Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/139612 Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r--src/core/CL/cl_kernels/gemm.cl256
1 files changed, 128 insertions, 128 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index f75161ca0a..5a6efe64b9 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -188,7 +188,7 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -198,7 +198,7 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -286,26 +286,26 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -366,7 +366,7 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -376,7 +376,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -596,26 +596,26 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -679,7 +679,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -689,7 +689,7 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -777,26 +777,26 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -853,7 +853,7 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -863,7 +863,7 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -1033,26 +1033,26 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -1122,7 +1122,7 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -1132,7 +1132,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -1261,26 +1261,26 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -1362,7 +1362,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -1372,7 +1372,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -1657,26 +1657,26 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -1750,7 +1750,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -1760,7 +1760,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -1975,26 +1975,26 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D
@@ -2068,7 +2068,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] pad_bottom Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] cross_plane_pad Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
@@ -2078,7 +2078,7 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
,
- uint pad_bottom
+ uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
)
{
@@ -2241,26 +2241,26 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible bottom paddings
+ // in order to take into account the presence of possible cross plane paddings
//
- // | |
- // | plane0 |
- // | |
- // |_____________|
- // |*************|
- // | pad_bottom |
- // |*************|
- // | |
- // | plane1 |
- // | |
- // |_____________|
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
- // Add offset due to the bottom paddings
- zout *= (pad_bottom * dst_stride_y);
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
// multiply dst_stride_z by DEPTH_GEMM3D