From 9ae06d4986bc3055f7786c1097b465bd321cf8eb Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 22 Oct 2020 16:37:12 +0100 Subject: COMPMID-3925: Dispatch CLGEMM with no padding y requirement - Add has_pad_y flag in GEMMKernelInfo - Skip reinterpret as 3D in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel if has_pad_y = false - Add test to validate CLGEMMMatrixMultiplyReshapedOnlyRHSkernel with had_pad_y = false/true - Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel to run with has_pad_y = false/true in CLGEMM - Check if the lhs/dst tensors have pad y. If not, run CLGEMMMatrixMultiplyReshapedOnlyRHSKernel without padding requirement Change-Id: I68bb43389789736d676b899ac7c77fd9138babaf Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4248 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h | 1 + arm_compute/core/KernelDescriptors.h | 4 +- arm_compute/runtime/CL/functions/CLGEMM.h | 4 + src/core/CL/cl_kernels/gemm.cl | 20 ++-- src/core/CL/cl_kernels/gemm_helpers.h | 16 +-- src/core/CL/cl_kernels/gemmlowp.cl | 14 +-- .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp | 66 ++++++------ src/runtime/CL/functions/CLGEMM.cpp | 52 ++++++--- tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp | 15 +++ .../CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp | 120 ++++----------------- tests/validation/fixtures/GEMMFixture.h | 15 ++- 11 files changed, 150 insertions(+), 177 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h index fc21f2a0f6..eab7fd219e 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h @@ -162,6 +162,7 @@ private: bool _add_bias; bool _broadcast_bias; bool _export_to_cl_image; + bool _has_pad_y; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/ diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index 1ee1686fb1..ea46bfa5a6 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -64,6 +64,7 @@ struct GEMMKernelInfo bool ireinterpret_input_as_3d, bool ibroadcast_bias, bool ifp_mixed_precision, + bool ihas_pad_y, ActivationLayerInfo iactivation_info, int inmult_transpose1xW_width, int imult_interleave4x4_height, @@ -72,7 +73,7 @@ struct GEMMKernelInfo int32_t ina_offset, int32_t inb_offset) : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision), - activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), rhs_info(irhs_info), + has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset) { } @@ -84,6 +85,7 @@ struct GEMMKernelInfo bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */ + bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */ ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h index 6e9cf0e2ca..92f9736e35 100644 --- a/arm_compute/runtime/CL/functions/CLGEMM.h +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -206,11 +206,15 @@ private: weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged _reshape_rhs_kernel_managed; CLGEMMMatrixMultiplyReshapedKernel _mm_reshaped_kernel; CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel; + CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_fallback_kernel; CLTensor _tmp_a; CLTensor _tmp_b; const ICLTensor *_original_b; + const ICLTensor *_lhs; + ICLTensor *_dst; bool _reshape_b_only_on_first_run; bool _is_prepared; + bool _has_pad_y; CLGEMMKernelType _gemm_kernel_type; }; } // namespace arm_compute diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 653aa5591c..fa93760847 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1121,7 +1121,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1227,7 +1227,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -1418,7 +1418,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1573,7 +1573,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -1839,7 +1839,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1969,7 +1969,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -2157,7 +2157,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -2278,7 +2278,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -4120,7 +4120,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -4232,7 +4232,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index 2534204f2f..54d38655a4 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -624,49 +624,49 @@ * @{ */ #define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 950faeca0b..4a05635669 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -433,7 +433,7 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -567,7 +567,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -604,7 +604,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -781,7 +781,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -819,7 +819,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -1009,7 +1009,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1080,7 +1080,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp index 912c763ed5..68f761b9e7 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp @@ -133,7 +133,8 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + // This approach should only be used when the input/output tensors have pad on the y direction + if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) { reinterpret_output_as_3d = false; } @@ -159,16 +160,6 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic input0_access(input0, 0, 0, - input0->dimension(0), - input0->dimension(1)); - AccessWindowStatic input1_access(input1, 0, 0, - input1->dimension(0), - input1->dimension(1)); - AccessWindowStatic output_access(output, 0, 0, - output->dimension(0), - output->dimension(1)); - if(input2 != nullptr) { const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; @@ -177,17 +168,9 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x), input2->dimension(1)); - window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - } - else - { - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor + window_changed = update_window_and_padding(win, input2_access); } - output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape())); - // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win; @@ -201,7 +184,7 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel() : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), - _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false) + _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false), _has_pad_y(false) { } @@ -232,10 +215,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext _add_bias = _input2 != nullptr; _broadcast_bias = gemm_info.broadcast_bias; _export_to_cl_image = rhs_info.export_to_cl_image; + _has_pad_y = gemm_info.has_pad_y; + + auto padding_info = get_padding_info({ input0, input1, output }); // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -257,6 +243,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1); + // These variables are used only if gemm_info.has_pad_y == true const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1); const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2); @@ -274,11 +261,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -296,6 +279,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + if(_has_pad_y) + { + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + } std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; @@ -307,6 +297,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext // Set config_id for enabling LWS tuning _config_id = kernel_name; _config_id += "_"; + _config_id += (_has_pad_y ? "" : "no_pad_y_"); _config_id += (_add_bias ? "add_bias_" : ""); _config_id += (_broadcast_bias ? "broadcast_bias_" : ""); _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); @@ -331,6 +322,8 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext _config_id += support::cpp11::to_string(rhs_info.h0); _config_id += "_"; _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, @@ -363,15 +356,24 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); } + const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y? 3u : 2u; + const size_t rhs_idx_batch_size = 2u; + const size_t bia_idx_batch_size = 2u; + const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y? 3u : 2u; + Window slice = window.first_slice_window_3D(); Window slice_matrix_b = slice; slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + // Get cross plane pads const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom; const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom; + // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor + ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); + cl::Image2D input1_image2d; if(_export_to_cl_image) @@ -414,28 +416,28 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co add_2D_tensor_argument(idx, _output, slice); // LHS stride_z - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[lhs_idx_batch_size])); // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[rhs_idx_batch_size])); // Bias stride_z (if _add_bias == true) if(_add_bias) { - _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[bia_idx_batch_size])); } // Output stride_z - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[out_idx_batch_size])); // Cross-plan padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d) + if(_reinterpret_input_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); } // Cross-plan padding (if _reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d) + if(_reinterpret_output_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); } diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index ccae6713a6..80c5496ede 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -60,11 +60,15 @@ CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager * _reshape_rhs_kernel_managed(), _mm_reshaped_kernel(), _mm_reshaped_only_rhs_kernel(), + _mm_reshaped_only_rhs_fallback_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), + _lhs(nullptr), + _dst(nullptr), _reshape_b_only_on_first_run(false), _is_prepared(false), + _has_pad_y(false), _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1) { } @@ -295,16 +299,8 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context std::unique_ptr gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); - unsigned int m_internal = m; - unsigned int b_internal = batch_size; - if(reinterpret_input_as_3d) - { - m_internal = a->info()->dimension(1); - b_internal = a->info()->dimension(2); - } - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m_internal, n, k, b_internal, data_type); + std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) @@ -317,9 +313,18 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); } - // Configure and tune matrix multiply kernel + // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true) + // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have + // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false + + // Configure matrix multiply kernel with no y padding support + kernel_info.has_pad_y = false; _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + // Configure matrix multiply kernel with y padding support + kernel_info.has_pad_y = true; + _mm_reshaped_only_rhs_fallback_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + if(!_reshape_b_only_on_first_run && use_mm_b) { _tmp_b.allocator()->allocate(); @@ -493,6 +498,10 @@ Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply + kernel_info.has_pad_y = false; + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + kernel_info.has_pad_y = true; ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; @@ -514,6 +523,8 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _is_prepared = gemm_info.retain_internal_weights(); _original_b = b; + _lhs = a; + _dst = output; // Get the GPU target bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); @@ -608,7 +619,6 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso void CLGEMM::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); // Run matrix multiply kernel @@ -675,8 +685,14 @@ void CLGEMM::run() CLScheduler::get().enqueue(_reshape_rhs_kernel, false); } } - - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); + if(_has_pad_y) + { + CLScheduler::get().enqueue(_mm_reshaped_only_rhs_fallback_kernel, true); + } + else + { + CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); + } break; } default: @@ -690,6 +706,16 @@ void CLGEMM::prepare() { if(!_is_prepared) { + // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + { + // Check if the lhs or dst tensors have padding + const unsigned int cross_plane_pad_lhs = _lhs->info()->padding().top + _lhs->info()->padding().bottom; + const unsigned int cross_plane_pad_dst = _dst->info()->padding().top + _dst->info()->padding().bottom; + + _has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0); + } + if(_gemm_kernel_type != CLGEMMKernelType::NATIVE_V1 && _reshape_b_only_on_first_run) { if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp index 98149ce149..5629a80f8e 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp @@ -340,6 +340,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -354,6 +355,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -371,6 +373,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, false /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -386,6 +389,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, false /**< Flag used to broadcast the bias addition */, true /**< wider accumm */, + true /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -400,6 +404,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, false /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -573,6 +578,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -586,6 +592,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -599,6 +606,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -613,6 +621,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -626,6 +635,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -939,6 +949,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -952,6 +963,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -965,6 +977,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -979,6 +992,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, @@ -992,6 +1006,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi false /**< reinterpret the input as 3D */, true /**< Flag used to broadcast the bias addition */, false /**< wider accumm */, + false /**< has pad y */, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp index d792afac1d..33912ae2ba 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp @@ -92,13 +92,12 @@ const auto n_values = framework::dataset::make("N", 51); const auto k_values = framework::dataset::make("K", 23); /** Batch size values to test */ -const auto b_values = framework::dataset::make("batch_size", 1, 3); +const auto b_values = framework::dataset::make("batch_size", 2); /** Activation values to test */ const auto act_values = framework::dataset::make("Activation", { - ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, -0.8f, 10.f), }); /** M0 values to test - precommit */ @@ -211,70 +210,6 @@ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned CLGEMMMatrixMultiplyReshapedOnlyRHS gemm; return bool(gemm.validate(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info)); } - -/** Zero padding test */ -bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, - unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value, - bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, bool input_as_3d, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info, - DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta) -{ - const unsigned int M = m_value; - const unsigned int N = n_value; - const unsigned int K = k_value; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0_value; - lhs_info.k0 = k0_value; - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0_value; - rhs_info.k0 = k0_value; - rhs_info.h0 = h0_value; - rhs_info.interleave = i_value_rhs; - rhs_info.transpose = t_value_rhs; - rhs_info.export_to_cl_image = export_to_cl_image; - - GEMMKernelInfo kernel_info; - kernel_info.m = M; - kernel_info.n = N; - kernel_info.k = K; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = act_info; - - const TensorShape lhs_shape(K, M, b_value); - const TensorShape rhs_shape(N, K, b_value); - const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1), - rhs_info); - - const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, dt_input0), - TensorInfo(rhs_shape_reshaped, 1, dt_input1), - kernel_info); - - const TensorShape bias_shape(N, - M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test - broadcast_bias? 1 : b_value); - - // Create tensors - CLTensor lhs = create_tensor(lhs_shape, dt_input0); - CLTensor rhs_reshaped = create_tensor(rhs_shape_reshaped, dt_input1); - CLTensor bias = create_tensor(bias_shape, dt_input2); - CLTensor dst = create_tensor(dst_shape, dt_output); - - ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Validate zero-padding - CLGEMMMatrixMultiplyReshapedOnlyRHS gemm; - - gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Padding can be added along rhs and bias's X dimension - return dst.info()->padding().empty() && lhs.info()->padding().empty() && bias.info()->padding().bottom == 0 && bias.info()->padding().top == 0; -} } // namespace TEST_SUITE(CL) @@ -326,33 +261,6 @@ b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS); } -/** Validate zero padding tests - * - * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios. - * - * Checks performed in order: - * - No partial blocks in both x and y dimensions - * - Partial blocks in x dimension - * - Partial blocks in y dimension - * - Partial blocks in both x and y dimensions - * - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1) - */ -DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(zip( -framework::dataset::make("M", { 24, 64, 101, 1, 100 }), -framework::dataset::make("N", { 48, 29, 16, 122, 41 })), -framework::dataset::make("M0", { 4, 8, 7, 2, 1 })), -framework::dataset::make("N0", { 4, 4, 16, 3, 16 })), -framework::dataset::make("export_to_cl_image", { false, true, true, false, false })), -m_value, n_value, m0_value, n0_value, export_to_cl_image) -{ - constexpr DataType dt = DataType::F32; - // Disable export_to_cl_image if the target platform does not support the OpenCL cl_khr_image2d_from_buffer extension - bool actual_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()) && export_to_cl_image; - - bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, actual_export_to_cl_image, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f); - ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); -} - TEST_SUITE(Float) TEST_SUITE(FP32) @@ -443,7 +351,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -456,6 +364,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -466,7 +375,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt } FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -479,6 +388,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -552,7 +462,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -565,6 +475,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -575,7 +486,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt } FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -588,6 +499,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -647,7 +559,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -660,6 +572,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), @@ -670,7 +583,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt } FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -683,6 +596,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), @@ -756,7 +670,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -769,6 +683,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), @@ -779,7 +694,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt } FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -792,6 +707,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index 0a964a7114..6288b6b970 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -1125,7 +1125,7 @@ class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::F public: template void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0, - bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info) + bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, bool has_pad_y, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info) { GEMMLHSMatrixInfo lhs_info; lhs_info.m0 = m0; @@ -1147,7 +1147,7 @@ public: const TensorShape rhs_shape(n, k, batch_size); const TensorShape bias_shape(n, 1, 1); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info, has_pad_y); _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); } @@ -1161,7 +1161,7 @@ protected: TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, DataType data_type, float alpha, float beta, - unsigned int m_h, const ActivationLayerInfo &act_info) + unsigned int m_h, const ActivationLayerInfo &act_info, bool has_pad_y) { // Create tensors TensorType lhs = create_tensor(lhs_shape, data_type, 1); @@ -1181,15 +1181,22 @@ protected: kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + kernel_info.has_pad_y = has_pad_y; // The output tensor will be auto-initialized within the function - // Create and configure function ReshapeRHSFunctionType reshape_rhs; GEMMFunctionType gemm; reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info); gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); + if(has_pad_y) + { + // Add dummy padding into lhs to validate has_pad_y path + lhs.info()->extend_padding(PaddingSize(2, 0, 2, 0)); + dst.info()->extend_padding(PaddingSize(2, 0, 1, 0)); + } + ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); -- cgit v1.2.1