aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/depthwise_convolution.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/depthwise_convolution.cl')
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution.cl41
1 files changed, 24 insertions, 17 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index c55a3d91c2..fb4a0fc157 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -280,16 +280,17 @@ inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
* @return a float2 containing 2 convoluted values.
*/
inline float2 convolution3x3(
- Image *src,
+ __global const uchar *src,
+ unsigned int src_stride_y,
const float mat0, const float mat1, const float mat2,
const float mat3, const float mat4, const float mat5,
const float mat6, const float mat7, const float mat8)
{
float2 pixels;
- pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
- pixels += convolution1x3(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
- pixels += convolution1x3(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
+ pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
+ pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
+ pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
return pixels;
}
@@ -341,27 +342,33 @@ __kernel void depthwise_convolution_3x3(
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif //defined(HAS_BIAS)
+
+ float2 pixels = 0.0f;
// Extract channel and linearized batch indices
const int channel = get_global_id(2) % DST_CHANNELS;
const int batch = get_global_id(2) / DST_CHANNELS;
// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
- src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
- float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
- float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
- float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
+ __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
- float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
- weights_values1.s0, weights_values1.s1, weights_values1.s2,
- weights_values2.s0, weights_values2.s1, weights_values2.s2);
+ // Load the weights
+ float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+ pixels = convolution3x3(src_addr, src_stride_y,
+ weights_values0.s0, weights_values0.s1, weights_values0.s2,
+ weights_values1.s0, weights_values1.s1, weights_values1.s2,
+ weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
- pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float bias = *((__global float *)(vector_offset(&biases, channel)));
+
+ pixels += (float2)bias;
#endif //defined(HAS_BIAS)
vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
@@ -1862,4 +1869,4 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1(
}
#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE) \ No newline at end of file