diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/comparisons.cl | 11 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/convert_fc_weights.cl | 4 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/deconvolution_layer.cl | 4 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/depth_to_space.cl | 6 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/dequantization_layer.cl | 4 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/gemv.cl | 24 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl | 6 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/pixelwise_mul_int.cl | 8 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/pooling_layer.cl | 18 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/pooling_layer_quantized.cl | 67 |
10 files changed, 75 insertions, 77 deletions
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl index 8824b136b2..a41b7e2966 100644 --- a/src/core/CL/cl_kernels/comparisons.cl +++ b/src/core/CL/cl_kernels/comparisons.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32 + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types. * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -93,12 +93,13 @@ __kernel void DEFINE_KERNEL(OP_NAME)( #if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) /** This function compares two quantized tensors. * + * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types. * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -133,8 +134,8 @@ __kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)( Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16); - int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16); + int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16); + int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16); in_a = in_a - (int16)((int)OFFSET_IN1); in_b = in_b - (int16)((int)OFFSET_IN2); diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl index d47b733acd..db0873755e 100644 --- a/src/core/CL/cl_kernels/convert_fc_weights.cl +++ b/src/core/CL/cl_kernels/convert_fc_weights.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,7 +32,7 @@ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128 * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, F16, F32 + * @param[in] src_ptr Pointer to the source image. Supported data types: All. * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl index ea2455c613..a9a6ac1947 100644 --- a/src/core/CL/cl_kernels/deconvolution_layer.cl +++ b/src/core/CL/cl_kernels/deconvolution_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ /** This function applies upsample on an input image. * - * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/F16/F32 + * @param[in] src_ptr Pointer to the source image. Supported data types: All. * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl index 2ffd0a40e7..5c2e8a1d57 100644 --- a/src/core/CL/cl_kernels/depth_to_space.cl +++ b/src/core/CL/cl_kernels/depth_to_space.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,7 +30,7 @@ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -72,7 +72,7 @@ __kernel void depth_to_space_nchw( * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl index 7550b4ba76..add86e3f2e 100644 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ b/src/core/CL/cl_kernels/dequantization_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ * @note Quantization scale of input tensor is passed in with -DSCALE=scale. * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM8 + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl index 811aa1b865..aabde4119f 100644 --- a/src/core/CL/cl_kernels/gemv.cl +++ b/src/core/CL/cl_kernels/gemv.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -110,12 +110,12 @@ __kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VEC } } } -#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */ -#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) /** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor. * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -123,13 +123,13 @@ __kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VEC * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] weights_ptr Pointer to the weights tensor. Same as @p src_ptr + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: S32 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor @@ -158,14 +158,14 @@ __kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src), // This kernel handle 4 rows in per thread so that it can reuse the weights for(int i = 0; i < SRC_WIDTH; i += 4) { - int4 w = convert_int4(vload4(0, (__global uchar *)(current_weights + i * weights_stride_x))) + (int4)weights_offset; + int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset; int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y; - int4 tmp0 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s0))) + (int4)input_offset; - int4 tmp1 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s1))) + (int4)input_offset; - int4 tmp2 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s2))) + (int4)input_offset; - int4 tmp3 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s3))) + (int4)input_offset; + int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset; + int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset; + int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset; + int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset; // Accumulate acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3; @@ -197,4 +197,4 @@ __kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src), } } } -#endif /* defined(SRC_WIDTH) && defined(SRC_HEIGHT) */ +#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */ diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl index 925975d2ba..b2ba65f812 100644 --- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl +++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,7 @@ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8 + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) @@ -102,7 +102,7 @@ __kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8 + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl index 989316d661..d277c6c56f 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -109,7 +109,7 @@ __kernel void pixelwise_mul_int( * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * - * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QSYMM16 + * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) @@ -117,7 +117,7 @@ __kernel void pixelwise_mul_int( * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 + * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) @@ -125,7 +125,7 @@ __kernel void pixelwise_mul_int( * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index c8b5e07b47..207669e43e 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -391,28 +391,16 @@ __kernel void pooling_layer_optimized_3( #if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -// Set the initial value for the pooling operation accordingly with the data type -#if defined(POOL_AVG) || defined(POOL_L2) -#define INITIAL_VALUE 0 -#else /* defined(POOL_AVG) || defined(POOL_L2) */ -#if FP16 -#define INITIAL_VALUE -HALF_MAX -#else // FP16 -#define INITIAL_VALUE -FLT_MAX -#endif // FP16 - -#endif // POOL_AVG - /** Performs a pooling function of pool size equal to N (NCHW) * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note -DFP16 must be passed at compile time if half float data type is used * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) @@ -519,13 +507,13 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz /** Performs a pooling function of pool size equal to N (NHWC) * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32 - * @note -DFP16 must be passed at compile time if half float data type is used * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl index 2df22d736c..3a370eea93 100644 --- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -23,18 +23,19 @@ */ #include "helpers.h" +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) -#define VEC_FLOAT(VEC_SIZE) \ - VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) #define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR(VEC_SIZE) VEC_DATA_TYPE(uchar, VEC_SIZE) #define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ { \ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \ - res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_UCHAR(VEC_SIZE)); \ + res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \ } #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ @@ -74,8 +75,10 @@ int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8 + * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -100,8 +103,8 @@ __kernel void pooling_layer_MxN_quantized_nchw( Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - int8 vdata = 0; - int sdata = 0; + int8 vdata = INITIAL_VALUE; + int sdata = INITIAL_VALUE; // Load data for(int y = 0; y < POOL_SIZE_Y; y++) @@ -109,17 +112,18 @@ __kernel void pooling_layer_MxN_quantized_nchw( int x = 0; for(; x <= ((int)POOL_SIZE_X - 8); x += 8) { - uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0)); - int8 data0 = convert_int8(data); - vdata = POOL_OP(vdata, data0); + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int8 data0 = convert_int8(data); + vdata = POOL_OP(vdata, data0); } // Leftover for(; x < (int)POOL_SIZE_X; ++x) { - uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0)); - int data0 = convert_int(data); - sdata = POOL_OP(sdata, data0); + DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int data0 = convert_int(data); + sdata = POOL_OP(sdata, data0); } } @@ -133,22 +137,22 @@ __kernel void pooling_layer_MxN_quantized_nchw( res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); #endif /* defined(POOL_AVG) */ - uchar result_u8 = convert_uchar(res); + DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - const float result_f32 = convert_float(result_u8); + const float result_f32 = convert_float(result_q8); const float input_offset = (float)OFFSET_IN1; const float input_scale = (float)SCALE_IN1; const float scale_out = (float)SCALE_OUT; const float offset_out = (float)OFFSET_OUT; const float in_f32 = (result_f32 - input_offset) * input_scale; const float out_f32 = in_f32 / scale_out + offset_out; - result_u8 = convert_uchar_sat(convert_int_rte(out_f32)); + result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - *(__global uchar *)output.ptr = result_u8; + *(__global DATA_TYPE *)output.ptr = result_q8; } int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h, @@ -158,7 +162,7 @@ int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int u #if defined(DST_DEPTH) int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y; #else /* defined(DST_DEPTH) */ - int start_y = get_global_id(2) * stride_y - pad_y; + int start_y = get_global_id(2) * stride_y - pad_y; #endif /* defined(DST_DEPTH) */ const int end_x = min(start_x + pool_size_x, upper_bound_w); @@ -178,8 +182,9 @@ int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int u * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8 + * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -209,17 +214,17 @@ __kernel void pooling_layer_MxN_quantized_nhwc( Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); #else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); #endif /* defined(DST_DEPTH) */ - int8 vdata = 0; + int8 vdata = INITIAL_VALUE; const int idx_width = get_global_id(1) * STRIDE_X; #if defined(DST_DEPTH) const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y; #else /* defined(DST_DEPTH) */ - const int idx_height = get_global_id(2) * STRIDE_Y; + const int idx_height = get_global_id(2) * STRIDE_Y; #endif /* defined(DST_DEPTH) */ for(int y = 0; y < POOL_SIZE_Y; ++y) @@ -231,9 +236,11 @@ __kernel void pooling_layer_MxN_quantized_nhwc( x1 = select(x1, PAD_X - idx_width - 1, y != y1); #if defined(DST_DEPTH) - uchar8 data = vload8(0, (__global uchar *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); #else /* defined(DST_DEPTH) */ - uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); #endif /* defined(DST_DEPTH) */ int8 data0 = convert_int8(data); @@ -246,11 +253,13 @@ __kernel void pooling_layer_MxN_quantized_nhwc( vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)))); #endif /* defined(POOL_AVG) */ - uchar8 out_u8 = convert_uchar8(vdata); + VEC_TYPE(8) + out_q8 = CONVERT(vdata, VEC_TYPE(8)); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - REQUANTIZE(8, out_u8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_u8); + REQUANTIZE(8, out_q8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q8); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ // Store result - vstore8(out_u8, 0, (__global uchar *)output.ptr); + vstore8(out_q8, 0, (__global DATA_TYPE *)output.ptr); } +#endif /* defined(DATA_TYPE) && defined(INITIAL_VALUE) */ |