aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-08-01 20:16:34 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commit4e0d3819be6c61cc00c7e0fa9b4b740738c703b7 (patch)
tree3bececa31ef77a4b1494af9b49e82dfd5c0c3a30 /src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
parentef915165692da33a37066dab467ba3ceade53f27 (diff)
downloadComputeLibrary-4e0d3819be6c61cc00c7e0fa9b4b740738c703b7.tar.gz
COMPMID-1437: (Nightly) OCLGrind failures in CLDepthwiseConvolution QA8 nhwc
Change-Id: I2c1e69b4654e928d8e7e9071258194f258bb6935 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/142368 Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/depthwise_convolution_quantized.cl')
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution_quantized.cl52
1 files changed, 26 insertions, 26 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index ca8efcdd87..fe902ed981 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -614,7 +614,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
int z_coord = 0;
int4 offset = 0;
- const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
+ const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
// We compute 2x1x1 [C,W,H] elements
VEC_INT acc = 0, sum = 0;
@@ -631,9 +631,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
#if INPUT_OFFSET != 0
- VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT) + CONVERT(w6, VEC_INT) + CONVERT(w7,
- VEC_INT)
- + CONVERT(w8, VEC_INT);
+ VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
+ + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
+ + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
#endif /* INPUT_OFFSET != 0 */
// Load input values
@@ -641,10 +641,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
// Clamp z_coord as for z = 0, it can be negative
// z_coord is casted to unsigned int in order to use just a min() operation
// A "-1" 32 bit signed variable converted to unsigned gives 4294967295
- z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
+ z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -653,7 +653,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
- z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
+ z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
offset = y_offset + (int4)(z_coord * src_stride_z);
VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -663,7 +663,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)src_stride_z;
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
@@ -770,7 +770,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
int z_coord = 0;
int4 offset = 0;
- int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
+ int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
// We compute 2x2x2 [C,W,H] elements
VEC_INT acc0 = 0, sum0 = 0;
@@ -790,9 +790,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
#if INPUT_OFFSET != 0
- VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT) + CONVERT(w6, VEC_INT) + CONVERT(w7,
- VEC_INT)
- + CONVERT(w8, VEC_INT);
+ VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
+ + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
+ + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
#endif /* INPUT_OFFSET != 0 */
// Load input values
@@ -800,10 +800,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
// Clamp z_coord as for z = 0, it can be negative
// z_coord is casted to unsigned int in order to use just a min() operation
// A "-1" 32 bit signed variable converted to unsigned gives 4294967295
- z_coord = z * NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
+ z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -813,7 +813,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
- z_coord = z * NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
+ z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
offset = y_offset + (int4)(z_coord * src_stride_z);
VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -824,7 +824,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)src_stride_z;
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
@@ -834,7 +834,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)(src_stride_z);
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
@@ -1013,7 +1013,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
int z_coord = 0;
int4 offset = 0;
- int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
+ int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
// We compute 2x2x2 [C,W,H] elements
VEC_INT acc0 = 0, sum0 = 0;
@@ -1033,9 +1033,9 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
#if INPUT_OFFSET != 0
- VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT) + CONVERT(w6, VEC_INT) + CONVERT(w7,
- VEC_INT)
- + CONVERT(w8, VEC_INT);
+ VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
+ + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
+ + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
#endif /* INPUT_OFFSET != 0 */
// Load input values
@@ -1043,10 +1043,10 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
// Clamp z_coord as for z = 0, it can be negative
// z_coord is casted to unsigned int in order to use just a min() operation
// A "-1" 32 bit signed variable converted to unsigned gives 4294967295
- z_coord = z * NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
+ z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
offset = y_offset + (int4)(z_coord * src_stride_z);
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -1056,7 +1056,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
- z_coord = z * NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
+ z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
offset = y_offset + (int4)(z_coord * src_stride_z);
VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -1067,7 +1067,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)src_stride_z;
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
@@ -1077,7 +1077,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)(src_stride_z);
- offset = min(offset, max_offset);
+ offset = min(offset, (int4)max_offset);
VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);