From 9ae06d4986bc3055f7786c1097b465bd321cf8eb Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 22 Oct 2020 16:37:12 +0100
Subject: COMPMID-3925: Dispatch CLGEMM with no padding y requirement

- Add has_pad_y flag in GEMMKernelInfo
- Skip reinterpret as 3D in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel if has_pad_y = false
- Add test to validate CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false/true
- Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel to run with has_pad_y = false/true in CLGEMM
- Check if the lhs/dst tensors have pad y. If not, run CLGEMMMatrixMultiplyReshapedOnlyRHSKernel without padding requirement

Change-Id: I68bb43389789736d676b899ac7c77fd9138babaf
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4248
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 src/core/CL/cl_kernels/gemm.cl                     | 20 +++----
 src/core/CL/cl_kernels/gemm_helpers.h              | 16 +++---
 src/core/CL/cl_kernels/gemmlowp.cl                 | 14 ++---
 .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp  | 66 +++++++++++-----------
 4 files changed, 59 insertions(+), 57 deletions(-)

(limited to 'src/core')
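Note for readers of this patch: the dispatch described by the points above can be sketched as the C++ fragment below. This is an illustrative sketch, not the literal CLGEMM code; the member names (_mm_reshaped_only_rhs_kernel, _mm_reshaped_only_rhs_fallback_kernel), the tensor names (lhs, rhs_reshaped, bias, dst) and the configure signature are assumptions based only on the description above.

    // Configure time (sketch): build two variants of the same kernel from one
    // GEMMKernelInfo, one that assumes y padding on lhs/dst and one that does not.
    gemm_kernel_info.has_pad_y = false;
    _mm_reshaped_only_rhs_kernel.configure(compile_context, lhs, rhs_reshaped, bias, dst,
                                           alpha, beta, lhs_info, rhs_info, gemm_kernel_info);

    gemm_kernel_info.has_pad_y = true;
    _mm_reshaped_only_rhs_fallback_kernel.configure(compile_context, lhs, rhs_reshaped, bias, dst,
                                                    alpha, beta, lhs_info, rhs_info, gemm_kernel_info);

    // Run time (sketch): inspect the actual y padding of the lhs/dst tensors and
    // enqueue the matching variant, so the no-pad-y kernel runs whenever possible.
    const bool has_pad_y = (lhs->info()->padding().top + lhs->info()->padding().bottom +
                            dst->info()->padding().top + dst->info()->padding().bottom) != 0;

    CLScheduler::get().enqueue(has_pad_y ? _mm_reshaped_only_rhs_fallback_kernel
                                         : _mm_reshaped_only_rhs_kernel,
                               true);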
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 653aa5591c..fa93760847 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1121,7 +1121,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -1227,7 +1227,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1418,7 +1418,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -1573,7 +1573,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1839,7 +1839,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -1969,7 +1969,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -2157,7 +2157,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -2278,7 +2278,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -4120,7 +4120,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -4232,7 +4232,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 2534204f2f..54d38655a4 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -624,49 +624,49 @@
  * @{
  */
 #define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \
     Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \
     Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \
     Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \
     Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \
     Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \
     Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \
     Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
 #define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
-    Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
     Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
     Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
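Note: the hunk above moves the block-row scaling out of the CALCULATE_Z_OFFSET_N macros. The caller now passes an absolute start row (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) in the kernels above, plain y * M0 in the gemmlowp reshaped-RHS kernels below) instead of the block index y, so the macro no longer multiplies by M0 itself. A minimal, self-contained C++ restatement of the per-row arithmetic follows; the names z_offset and start_row are illustrative only, not names from the library.

    #include <algorithm>
    #include <cstdint>

    // Byte offset added to row (start_row + i) of an M0-row block when a 2D buffer
    // with cross-plane padding is addressed as a 3D tensor of DEPTH_GEMM3D planes
    // of HEIGHT_GEMM3D rows each: every plane boundary crossed so far contributes
    // CROSS_PLANE_PAD padded rows of STRIDE_Y bytes.
    uint32_t z_offset(uint32_t start_row, uint32_t i, uint32_t height_gemm3d,
                      uint32_t depth_gemm3d, uint32_t cross_plane_pad, uint32_t stride_y)
    {
        uint32_t z = (start_row + i) / height_gemm3d; // plane holding this row
        z          = std::min(depth_gemm3d - 1u, z);  // clamp to the last plane
        return z * cross_plane_pad * stride_y;        // padding bytes to skip
    }

When the lhs/dst tensors carry no y padding, cross_plane_pad is zero and the whole computation collapses to zero, which is what allows the host side further below to drop the REINTERPRET_*_AS_3D path when has_pad_y = false.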
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 950faeca0b..4a05635669 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -433,7 +433,7 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -567,7 +567,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -604,7 +604,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -781,7 +781,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -819,7 +819,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1009,7 +1009,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_INPUT_AS_3D)
     // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -1080,7 +1080,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
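Before the host-side changes that follow, a standalone numerical sketch (not ACL code; the sizes are invented) of what the cross-plane pad compensates for, and therefore of what becomes unnecessary when the lhs/dst tensors have no y padding:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // A 2D buffer holding planes of height_gemm3d rows each, where every plane
        // is followed by pad_y padded rows (the tensor's top + bottom y padding).
        const std::size_t height_gemm3d = 24; // rows per plane
        const std::size_t pad_y         = 2;  // padded rows between planes
        const std::size_t stride_y      = 64; // bytes per row

        const std::size_t row = 30;                  // global row index
        const std::size_t z   = row / height_gemm3d; // plane containing the row

        // Offset the kernel must use when the buffer has y padding (the
        // REINTERPRET_*_AS_3D path) versus the plain addressing that is
        // correct only when pad_y == 0.
        const std::size_t padded_offset = row * stride_y + z * pad_y * stride_y;
        const std::size_t packed_offset = row * stride_y;

        std::printf("padded: %zu bytes, packed: %zu bytes\n", padded_offset, packed_offset);
        return 0;
    }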
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 912c763ed5..68f761b9e7 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -133,7 +133,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    // This approach should only be used when the input/output tensors have pad on the y direction
+    if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
     {
         reinterpret_output_as_3d = false;
     }
@@ -159,16 +160,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    AccessWindowStatic input0_access(input0, 0, 0,
-                                     input0->dimension(0),
-                                     input0->dimension(1));
-    AccessWindowStatic input1_access(input1, 0, 0,
-                                     input1->dimension(0),
-                                     input1->dimension(1));
-    AccessWindowStatic output_access(output, 0, 0,
-                                     output->dimension(0),
-                                     output->dimension(1));
-
     if(input2 != nullptr)
     {
         const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
@@ -177,17 +168,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
                                          ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
                                          input2->dimension(1));
 
-        window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+        window_changed = update_window_and_padding(win, input2_access);
     }
 
-    output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
-
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
     Window collapsed = win;
@@ -201,7 +184,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 
 CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false)
+      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false), _has_pad_y(false)
 {
 }
 
@@ -232,10 +215,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = gemm_info.broadcast_bias;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
+    _has_pad_y                = gemm_info.has_pad_y;
+
+    auto padding_info = get_padding_info({ input0, input1, output });
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
    {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
@@ -257,6 +243,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
 
     // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
     const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
 
+    // These variables are used only if gemm_info.has_pad_y == true
     const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
     const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
@@ -274,11 +261,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
     build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
@@ -296,6 +279,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+    if(_has_pad_y)
+    {
+        build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+        build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+    }
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
@@ -307,6 +297,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
     _config_id += "_";
+    _config_id += (_has_pad_y ? "" : "no_pad_y_");
     _config_id += (_add_bias ? "add_bias_" : "");
     _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
     _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
@@ -331,6 +322,8 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     _config_id += support::cpp11::to_string(rhs_info.h0);
     _config_id += "_";
     _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
@@ -363,15 +356,24 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co
         ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
     }
 
+    const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y? 3u : 2u;
+    const size_t rhs_idx_batch_size = 2u;
+    const size_t bia_idx_batch_size = 2u;
+    const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y? 3u : 2u;
+
     Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
 
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
+    // Get cross plane pads
     const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom;
     const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom;
 
+    // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
+    ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
+
     cl::Image2D input1_image2d;
 
     if(_export_to_cl_image)
@@ -414,28 +416,28 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co
         add_2D_tensor_argument(idx, _output, slice);
 
         // LHS stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input0->info()->strides_in_bytes()[lhs_idx_batch_size]));
 
         // RHS stride_z (not used if _export_to_cl_image == true)
-        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input1->info()->strides_in_bytes()[rhs_idx_batch_size]));
 
         // Bias stride_z (if _add_bias == true)
         if(_add_bias)
         {
-            _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input2->info()->strides_in_bytes()[2]));
+            _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_input2->info()->strides_in_bytes()[bia_idx_batch_size]));
         }
 
         // Output stride_z
-        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_output->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(_output->info()->strides_in_bytes()[out_idx_batch_size]));
 
         // Cross-plan padding (if _reinterpret_input_as_3d = true)
-        if(_reinterpret_input_as_3d)
+        if(_reinterpret_input_as_3d && _has_pad_y)
        {
             _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(total_cross_plane_pad_lhs));
         }
 
         // Cross-plan padding (if _reinterpret_output_as_3d = true)
-        if(_reinterpret_output_as_3d)
+        if(_reinterpret_output_as_3d && _has_pad_y)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<cl_uint>(total_cross_plane_pad_out));
         }
--
cgit v1.2.1