From 410bca42f560c87d4860dc5ae7374437ded2cd76 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Thu, 22 Oct 2020 11:07:33 +0100 Subject: COMPMID-3599: Fix OpenCL gemmlowp_offset_contribution kernel The kernel was not using the preprocessor arguments needed to avoid the use of padding. Change-Id: I6b5fdf4f3f14edbef60b9d5b60179d619700bc00 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4232 Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/gemmlowp.cl | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 97150e05a2..950faeca0b 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -1486,44 +1486,44 @@ __kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) const int z = get_global_id(2); // Compute offset contribution - int4 offset_term_s32 = offset_contribution( - x, y, z + VEC_INT offset_term_s32 = offset_contribution( + x, y, z #if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes + , + sum_col_ptr, + sum_col_stride_x, + sum_col_step_x, + sum_col_stride_y, + sum_col_step_y, + sum_col_offset_first_element_in_bytes #endif // defined(A_OFFSET) #if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes + , + sum_row_ptr, + sum_row_stride_x, + sum_row_step_x, + sum_row_stride_y, + sum_row_step_y, + sum_row_offset_first_element_in_bytes #endif // defined(B_OFFSET) #if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes + , + biases_ptr, + biases_stride_x, + biases_step_x, + biases_offset_first_element_in_bytes #endif 
// defined(ADD_BIAS) - ); + ); __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; - int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); // Add the offset terms to GEMM's result - in_s32 += offset_term_s32; + in_s32_0 += offset_term_s32; // Store the result with the offset contribution - vstore4(in_s32, 0, (__global int *)mm_result_addr); + STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) -- cgit v1.2.1