aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/winograd_input_transform.cl
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-07-18 18:06:32 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commitdbbe4a31f8eac4bceffa3805640c658bed25ce18 (patch)
tree63dcf7c1d3e324786d68bc72ee54fa2702026e3a /src/core/CL/cl_kernels/winograd_input_transform.cl
parente1553374d037dbf84999258d5bc88927891770cc (diff)
downloadComputeLibrary-dbbe4a31f8eac4bceffa3805640c658bed25ce18.tar.gz
COMPMID-1412; (Nightly) OCLGrind failures in input_transform CL NHWC
Change-Id: I14d19be90b6c56f2259a2c94fd793829cfb55328 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140538 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Pablo Tello <pablo.tello@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/winograd_input_transform.cl')
-rw-r--r--src/core/CL/cl_kernels/winograd_input_transform.cl150
1 files changed, 75 insertions, 75 deletions
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index fcd1b3b9ce..da18e4ab5b 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -616,8 +616,8 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
// Clamp coordinates. This clamp is valid for all rows
int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
- y_coord0 = clamp(y_coord0, -1, SRC_DIM_1);
- y_coord1 = clamp(y_coord1, -1, SRC_DIM_1);
+ y_coord0 = clamp(y_coord0, (int4) - 1, (int4)SRC_DIM_1);
+ y_coord1 = clamp(y_coord1, (int2) - 1, (int2)SRC_DIM_1);
int z_coord;
int4 valid_y0;
@@ -625,17 +625,17 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row4
- z_coord = (z * 4) - PAD_TOP + 4;
+ z_coord = (z * 4) - (int)PAD_TOP + 4;
// If z < 0, set y to -1
- valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
- valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
+ valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+ valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
// If z >= SRC_DIM_2, set y to SRC_DIM_2
- valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
- valid_y1 = select(valid_y1, SRC_DIM_1, (int2)z_coord >= SRC_DIM_2);
+ valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+ valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
// Clamp z coordinate
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
float d40 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
float d41 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -661,14 +661,14 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row0
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 0;
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
#if PAD_TOP != 0
- valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
- valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
- valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
- valid_y1 = select(valid_y1, SRC_DIM_1, (int2)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+ valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+ valid_y0 = select(valid_y0, (int)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+ valid_y1 = select(valid_y1, (int)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
#else // PAD_TOP != 0
valid_y0 = y_coord0;
valid_y1 = y_coord1;
@@ -689,8 +689,8 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
valid_y0 = select(valid_y0, (int4)SRC_DIM_1, z_coords0 >= (int4)SRC_DIM_2);
valid_y1 = select(valid_y1, (int2)SRC_DIM_1, z_coords1 >= (int2)SRC_DIM_2);
- z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)(SRC_DIM_2 - 1));
- z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)(SRC_DIM_2 - 1));
+ z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
+ z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
float d00 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
float d01 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
@@ -709,12 +709,12 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row2
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 2;
- valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
- valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
- valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
- valid_y1 = select(valid_y1, SRC_DIM_1, (int2)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+ valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+ valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+ valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+ valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
float d20 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
float d21 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -782,7 +782,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row1
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 1;
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
// Row1 can never be out of bounds
valid_y0 = y_coord0;
valid_y1 = y_coord1;
@@ -795,13 +795,13 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
float d15 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
// Row3
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 3;
- valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
- valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
- valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
- valid_y1 = select(valid_y1, SRC_DIM_1, (int2)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+ valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+ valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+ valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+ valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
float d30 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
float d31 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -918,13 +918,13 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
dst_addr += dst_plane_stride;
// Row5
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 5;
- valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
- valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
- valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
- valid_y1 = select(valid_y1, SRC_DIM_1, (int2)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+ valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0);
+ valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0);
+ valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2);
+ valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
float d50 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
float d51 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -998,7 +998,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
// Clamp coordinates. This clamp is valid for all rows
int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
- y_coord = clamp(y_coord, -1, SRC_DIM_1);
+ y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
// Row0
// We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels
@@ -1025,14 +1025,14 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
// We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x5 kernels
- int y_coord = y * OUTPUT_TILE_W;
+ int y_coord = y * (int)OUTPUT_TILE_W;
// Row0
// We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels
int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
- int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0); // If z < 0, set y to -1
- valid_y = select(valid_y, SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1); // Clamp z coordinate
+ int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0); // If z < 0, set y to -1
+ valid_y = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+ z_coord = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1); // Clamp z coordinate
// Load the input tile
float8 in_row0;
@@ -1057,13 +1057,13 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
// Clamp coordinates. This clamp is valid for all rows
int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
- y_coord = clamp(y_coord, -1, SRC_DIM_1);
+ y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
// Row0
- int z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 0;
- int8 valid_y = select(y_coord, -1, (int8)z_coord < 0); // If z < 0, set y to -1
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1); // Clamp z coordinate
+ int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+ int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); // Clamp z coordinate
// Load the input tile
in_row0.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1076,10 +1076,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row0.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row1
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 1;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row1.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row1.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1091,10 +1091,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row1.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row2
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 2;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row2.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row2.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1106,10 +1106,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row2.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row3
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 3;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row3.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row3.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1121,10 +1121,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row3.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row4
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 4;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row4.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row4.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1136,10 +1136,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row4.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row5
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 5;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row5.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row5.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1151,10 +1151,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row5.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row6
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 6;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row6.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row6.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
@@ -1166,10 +1166,10 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
in_row6.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row7
- z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 7;
- valid_y = select(y_coord, -1, (int8)z_coord < 0);
- valid_y = select(valid_y, SRC_DIM_1, (int8)z_coord >= SRC_DIM_2);
- z_coord = clamp(z_coord, 0, SRC_DIM_2 - 1);
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
in_row7.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
in_row7.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);