-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution_quantized.cl    | 65
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp |  8
2 files changed, 37 insertions(+), 36 deletions(-)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 0cd4e7148e..8a757fc2bd 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,22 +45,23 @@
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return an int8 containing 8 convolved values.
*/
-inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_1(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp = CONVERT(vload4(0, left_pixel), int4);
+ int8 temp0 = CONVERT(vload8(0, left_pixel), int8);
+ int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2);
- int2 left = CONVERT(temp.s01, int2);
- int2 middle = CONVERT(temp.s12, int2);
- int2 right = CONVERT(temp.s23, int2);
+ int8 left = CONVERT(temp0.s01234567, int8);
+ int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);
+ int8 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
@@ -72,23 +73,23 @@ inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return an int8 containing 8 convolved values.
*/
-inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_2(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
- int temp1 = CONVERT(*(left_pixel + 4 * sizeof(uchar)), int);
+ int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+ int temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int);
- int2 left = CONVERT(temp0.s02, int2);
- int2 middle = CONVERT(temp0.s13, int2);
- int2 right = CONVERT((int2)(temp0.s2, temp1), int2);
+ int8 left = CONVERT(temp0.s02468ace, int8);
+ int8 middle = CONVERT(temp0.s13579bdf, int8);
+ int8 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
@@ -100,23 +101,23 @@ inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return an int8 containing 8 convolved values.
*/
-inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_3(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
- int2 temp1 = CONVERT(vload2(0, (left_pixel + 4 * sizeof(uchar))), int2);
+ int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+ int8 temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8);
- int2 left = CONVERT(temp0.s03, int2);
- int2 middle = CONVERT((int2)(temp0.s1, temp1.s0), int2);
- int2 right = CONVERT((int2)(temp0.s2, temp1.s1), int2);
+ int8 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);
+ int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);
+ int8 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
@@ -144,9 +145,9 @@ inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
* @param[in] output_shift Output scale divisor exponent
* @param[in] bias (Optional) Bias value
*
- * @return a uchar2 containing 2 convoluted values.
+ * @return a uchar8 containing 8 convolved values.
*/
-inline uchar2 convolution3x3(
+inline uchar8 convolution3x3(
Image *src,
const uchar mat0, const uchar mat1, const uchar mat2,
const uchar mat3, const uchar mat4, const uchar mat5,
@@ -159,20 +160,20 @@ inline uchar2 convolution3x3(
#endif //defined(HAS_BIAS)
)
{
- int2 pixels;
+ int8 pixels;
pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);
#if defined(HAS_BIAS)
- pixels += (int2)(bias);
+ pixels += (int8)(bias);
#endif //defined(HAS_BIAS)
- pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 2);
+ pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8);
pixels = pixels + output_offset;
pixels = clamp(pixels, 0, 255);
- return CONVERT(pixels, uchar2);
+ return CONVERT(pixels, uchar8);
}
/** This function computes the horizontal integral of the image.
@@ -241,7 +242,7 @@ __kernel void depthwise_convolution_3x3_quantized(
int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
#endif //defined(HAS_BIAS)
- uchar2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+ uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
weights_values1.s0, weights_values1.s1, weights_values1.s2,
weights_values2.s0, weights_values2.s1, weights_values2.s2,
input_offset, weight_offset, output_offset,
@@ -252,7 +253,7 @@ __kernel void depthwise_convolution_3x3_quantized(
#endif //defined(HAS_BIAS)
);
- vstore2(pixels, 0, dst.ptr);
+ vstore8(pixels, 0, dst.ptr);
}
#endif //defined(CONV_STRIDE_X)
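To see why the stride-2 path above now loads 16 pixels plus one scalar: output i consumes input pixels 2i, 2i+1 and 2i+2, so eight outputs span indices 0..16. The following standalone C sketch (not part of the patch; the offsets and coefficients are made-up values) reproduces that indexing with a scalar reference:

/* Sanity check for the widened stride-2 swizzle (illustrative only).
 * Output i at stride 2 reads pixels 2i, 2i+1, 2i+2 -- exactly the lanes
 * picked by temp0.s02468ace, temp0.s13579bdf and (temp0.s2468, temp0.sace,
 * temp1) in the kernel. */
#include <stdio.h>

int main(void)
{
    unsigned char in[17];                 /* 3 + (8 - 1) * 2 pixels per row */
    for (int i = 0; i < 17; ++i)
        in[i] = (unsigned char)(10 * i);  /* arbitrary test pattern */

    const int input_offset = 3, weight_offset = -2;  /* assumed offsets */
    const int lc = 1, mc = 5, rc = 9;                /* assumed coefficients */

    for (int i = 0; i < 8; ++i) {         /* 8 outputs per work-item now */
        int left   = in[2 * i];           /* lane of temp0.s02468ace */
        int middle = in[2 * i + 1];       /* lane of temp0.s13579bdf */
        int right  = in[2 * i + 2];       /* lane of (s2468, sace, temp1) */
        int acc = (left   + input_offset) * (lc + weight_offset)
                + (middle + input_offset) * (mc + weight_offset)
                + (right  + input_offset) * (rc + weight_offset);
        printf("out[%d] = %d\n", i, acc);
    }
    return 0;
}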
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index f9229ba294..1c0fe9984f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -158,9 +158,9 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con
}
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 2;
- const unsigned int num_elems_written_per_iteration = 2;
- const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x;
+ const unsigned int num_elems_processed_per_iteration = 8 / data_size_from_type(input->info()->data_type());
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration;
+ const unsigned int num_elems_read_per_iteration = 3 + (num_elems_processed_per_iteration - 1) * _conv_stride_x;
const unsigned int num_rows_read_per_iteration = 3;
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
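The new read-width formula ties the host-side window directly to the loads in the kernel. A small standalone sketch (illustrative only, not library code) evaluates it for the three supported strides:

/* Check of the new window arithmetic in configure(): for 1-byte QASYMM8
 * data the kernel now processes 8 elements per iteration and must read
 * 3 + (8 - 1) * stride_x pixels per row. */
#include <stdio.h>

int main(void)
{
    const unsigned data_size = 1;              /* sizeof(uchar) for QASYMM8 */
    const unsigned processed = 8 / data_size;  /* num_elems_processed_per_iteration */

    for (unsigned stride_x = 1; stride_x <= 3; ++stride_x) {
        unsigned read = 3 + (processed - 1) * stride_x;
        printf("stride_x=%u -> %u pixels read\n", stride_x, read);
    }
    /* stride 1 -> 10 (vload8 + vload2), stride 2 -> 17 (vload16 + 1 scalar),
     * stride 3 -> 24 (vload16 + vload8): matches the loads in the .cl file. */
    return 0;
}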