From 96880cf00707d394938ec7fe31c21c79a2ac3f0c Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 20 Oct 2017 18:52:20 +0100
Subject: COMPMID-640: FullyConnectedLayer failures on both NEON/CL

Change-Id: Idd830cff054114123229c189e423b753b8064146
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/92623
Reviewed-by: Robert Hughes
Tested-by: Kaizen
Reviewed-by: Anthony Barbier
---
 src/core/CL/cl_kernels/gemm.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/core/CL')

diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 35a2e4704f..7f2a08bc2c 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1040,7 +1040,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
     VECTOR_TYPE acc3 = 0.0f;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
     {
         // Load values from matrix A
         VEC_DATA_TYPE(DATA_TYPE, 2)
@@ -1344,7 +1344,7 @@ __kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
     // This for loop performs 4 accumulations per iteration
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
     {
         short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -1629,7 +1629,7 @@ __kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
 
     float4 acc = 0.0f;
 
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
     {
         float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
         float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
-- 
cgit v1.2.1