aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2019-12-20 13:26:08 +0000
committerMichele Di Giorgio <michele.digiorgio@arm.com>2020-01-14 17:21:26 +0000
commitcbbed288a71f2f048123db3cf396361e5d66ce93 (patch)
treebd48122e283272adb668a42097ea69ca3f0473a6 /src/core/CL/cl_kernels
parent70d33bdfd36e1b44b0573189dca67ed7c63dd59e (diff)
downloadComputeLibrary-cbbed288a71f2f048123db3cf396361e5d66ce93.tar.gz
COMPMID-2991: Add support for QASYMM8_SIGNED in CL kernels/functions - part 2
Adding support for QASYMM8_SIGNED to the following CL kernels/functions: - CLActivationLayerKernel/CLActivationLayer - CLComparisonKernel/CLComparison - CLConvertFullyConnectedWeightsKernel/CLConvertFullyConnectedWeights - CLDeconvolutionLayerUpsampleKernel/CLDeconvolutionLayerUpsample - CLDepthToSpaceLayerKernel/CLDepthToSpaceLayer - CLDequantizationLayerKernel/CLDequantizationLayer - CLGEMMMatrixVectorMultiplyKernel - CLNormalizePlanarYUVLayerKernel - CLPReluLayer - CLPixelWiseMultiplicationKernel/CLPixelWiseMultiplication - CLPoolingLayerKernel/CLPoolingLayer Change-Id: I874bbb7c2b08baa9c5ff4c9e6bc8778b42a6bec5 Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/2539 Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--src/core/CL/cl_kernels/comparisons.cl11
-rw-r--r--src/core/CL/cl_kernels/convert_fc_weights.cl4
-rw-r--r--src/core/CL/cl_kernels/deconvolution_layer.cl4
-rw-r--r--src/core/CL/cl_kernels/depth_to_space.cl6
-rw-r--r--src/core/CL/cl_kernels/dequantization_layer.cl4
-rw-r--r--src/core/CL/cl_kernels/gemv.cl24
-rw-r--r--src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl6
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_int.cl8
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl18
-rw-r--r--src/core/CL/cl_kernels/pooling_layer_quantized.cl67
10 files changed, 75 insertions, 77 deletions
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl
index 8824b136b2..a41b7e2966 100644
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ b/src/core/CL/cl_kernels/comparisons.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
* @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
*
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types.
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -93,12 +93,13 @@ __kernel void DEFINE_KERNEL(OP_NAME)(
#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2)
/** This function compares two quantized tensors.
*
+ * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
* @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
* @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
* @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
* @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
*
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types.
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -133,8 +134,8 @@ __kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)(
Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
- int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
- int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
+ int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16);
+ int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16);
in_a = in_a - (int16)((int)OFFSET_IN1);
in_b = in_b - (int16)((int)OFFSET_IN2);
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl
index d47b733acd..db0873755e 100644
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ b/src/core/CL/cl_kernels/convert_fc_weights.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,7 @@
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All.
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index ea2455c613..a9a6ac1947 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
/** This function applies upsample on an input image.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All.
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl
index 2ffd0a40e7..5c2e8a1d57 100644
--- a/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/src/core/CL/cl_kernels/depth_to_space.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,7 @@
* @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
* @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -72,7 +72,7 @@ __kernel void depth_to_space_nchw(
* @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
* @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
index 7550b4ba76..add86e3f2e 100644
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
* @note Quantization scale of input tensor is passed in with -DSCALE=scale.
* @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM8
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
index 811aa1b865..aabde4119f 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/gemv.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -110,12 +110,12 @@ __kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VEC
}
}
}
-#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
-#if defined(SRC_WIDTH) && defined(SRC_HEIGHT)
/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -123,13 +123,13 @@ __kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VEC
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: S32
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
@@ -158,14 +158,14 @@ __kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
// This kernel handle 4 rows in per thread so that it can reuse the weights
for(int i = 0; i < SRC_WIDTH; i += 4)
{
- int4 w = convert_int4(vload4(0, (__global uchar *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
+ int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
- int4 tmp0 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s0))) + (int4)input_offset;
- int4 tmp1 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s1))) + (int4)input_offset;
- int4 tmp2 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s2))) + (int4)input_offset;
- int4 tmp3 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s3))) + (int4)input_offset;
+ int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
+ int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
+ int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
+ int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;
// Accumulate
acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
@@ -197,4 +197,4 @@ __kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
}
}
}
-#endif /* defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
+#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
index 925975d2ba..b2ba65f812 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,7 @@
* @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
* @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
*
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
@@ -102,7 +102,7 @@ __kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
* @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
* @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
*
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index 989316d661..d277c6c56f 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -109,7 +109,7 @@ __kernel void pixelwise_mul_int(
* @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
@@ -117,7 +117,7 @@ __kernel void pixelwise_mul_int(
* @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
@@ -125,7 +125,7 @@ __kernel void pixelwise_mul_int(
* @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index c8b5e07b47..207669e43e 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -391,28 +391,16 @@ __kernel void pooling_layer_optimized_3(
#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-// Set the initial value for the pooling operation accordingly with the data type
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define INITIAL_VALUE 0
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#if FP16
-#define INITIAL_VALUE -HALF_MAX
-#else // FP16
-#define INITIAL_VALUE -FLT_MAX
-#endif // FP16
-
-#endif // POOL_AVG
-
/** Performs a pooling function of pool size equal to N (NCHW)
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note -DFP16 must be passed at compile time if half float data type is used
* @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
@@ -519,13 +507,13 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz
/** Performs a pooling function of pool size equal to N (NHWC)
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
- * @note -DFP16 must be passed at compile time if half float data type is used
* @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
* @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
* @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
*
* @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 2df22d736c..3a370eea93 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,18 +23,19 @@
*/
#include "helpers.h"
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-#define VEC_FLOAT(VEC_SIZE) \
- VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE)
#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR(VEC_SIZE) VEC_DATA_TYPE(uchar, VEC_SIZE)
#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
{ \
const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
- res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_UCHAR(VEC_SIZE)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \
}
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
@@ -74,8 +75,10 @@ int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -100,8 +103,8 @@ __kernel void pooling_layer_MxN_quantized_nchw(
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- int8 vdata = 0;
- int sdata = 0;
+ int8 vdata = INITIAL_VALUE;
+ int sdata = INITIAL_VALUE;
// Load data
for(int y = 0; y < POOL_SIZE_Y; y++)
@@ -109,17 +112,18 @@ __kernel void pooling_layer_MxN_quantized_nchw(
int x = 0;
for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
{
- uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0));
- int8 data0 = convert_int8(data);
- vdata = POOL_OP(vdata, data0);
+ VEC_TYPE(8)
+ data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+ int8 data0 = convert_int8(data);
+ vdata = POOL_OP(vdata, data0);
}
// Leftover
for(; x < (int)POOL_SIZE_X; ++x)
{
- uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0));
- int data0 = convert_int(data);
- sdata = POOL_OP(sdata, data0);
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+ int data0 = convert_int(data);
+ sdata = POOL_OP(sdata, data0);
}
}
@@ -133,22 +137,22 @@ __kernel void pooling_layer_MxN_quantized_nchw(
res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
#endif /* defined(POOL_AVG) */
- uchar result_u8 = convert_uchar(res);
+ DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- const float result_f32 = convert_float(result_u8);
+ const float result_f32 = convert_float(result_q8);
const float input_offset = (float)OFFSET_IN1;
const float input_scale = (float)SCALE_IN1;
const float scale_out = (float)SCALE_OUT;
const float offset_out = (float)OFFSET_OUT;
const float in_f32 = (result_f32 - input_offset) * input_scale;
const float out_f32 = in_f32 / scale_out + offset_out;
- result_u8 = convert_uchar_sat(convert_int_rte(out_f32));
+ result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
- *(__global uchar *)output.ptr = result_u8;
+ *(__global DATA_TYPE *)output.ptr = result_q8;
}
int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
@@ -158,7 +162,7 @@ int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int u
#if defined(DST_DEPTH)
int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
#else /* defined(DST_DEPTH) */
- int start_y = get_global_id(2) * stride_y - pad_y;
+ int start_y = get_global_id(2) * stride_y - pad_y;
#endif /* defined(DST_DEPTH) */
const int end_x = min(start_x + pool_size_x, upper_bound_w);
@@ -178,8 +182,9 @@ int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int u
* @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -209,17 +214,17 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
#else /* defined(DST_DEPTH) */
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
#endif /* defined(DST_DEPTH) */
- int8 vdata = 0;
+ int8 vdata = INITIAL_VALUE;
const int idx_width = get_global_id(1) * STRIDE_X;
#if defined(DST_DEPTH)
const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
#else /* defined(DST_DEPTH) */
- const int idx_height = get_global_id(2) * STRIDE_Y;
+ const int idx_height = get_global_id(2) * STRIDE_Y;
#endif /* defined(DST_DEPTH) */
for(int y = 0; y < POOL_SIZE_Y; ++y)
@@ -231,9 +236,11 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
x1 = select(x1, PAD_X - idx_width - 1, y != y1);
#if defined(DST_DEPTH)
- uchar8 data = vload8(0, (__global uchar *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+ VEC_TYPE(8)
+ data = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
#else /* defined(DST_DEPTH) */
- uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+ VEC_TYPE(8)
+ data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
#endif /* defined(DST_DEPTH) */
int8 data0 = convert_int8(data);
@@ -246,11 +253,13 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
#endif /* defined(POOL_AVG) */
- uchar8 out_u8 = convert_uchar8(vdata);
+ VEC_TYPE(8)
+ out_q8 = CONVERT(vdata, VEC_TYPE(8));
#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- REQUANTIZE(8, out_u8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_u8);
+ REQUANTIZE(8, out_q8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q8);
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
// Store result
- vstore8(out_u8, 0, (__global uchar *)output.ptr);
+ vstore8(out_q8, 0, (__global DATA_TYPE *)output.ptr);
}
+#endif /* defined(DATA_TYPE) && defined(INITIAL_VALUE) */