From 5e549fa8ff058eb06cb74de43b9f89a08d0b4a9c Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 16 Mar 2022 11:14:06 +0000 Subject: Add CLPool3d Int8 Support - Adds Qasymm8 and Qasymm8_signed support to the 3d pool operator Resolves: COMPMID-4669 Signed-off-by: Mohammed Suhail Munshi Change-Id: I36038c2b7c4f36baf67f7aae801356890e104538 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/410496 Tested-by: bsgcomp Reviewed-by: Sheri Zhang Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7391 Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins --- Android.bp | 1 + SConscript | 1 + .../runtime/CL/functions/CLPooling3dLayer.h | 8 +- docs/user_guide/operator_list.dox | 4 +- src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl | 11 +- .../cl_kernels/nhwc/pooling_3d_layer_quantized.cl | 185 +++++++++++++++++++++ src/gpu/cl/ClKernelLibrary.cpp | 5 + src/gpu/cl/kernels/ClPool3dKernel.cpp | 55 ++++-- src/gpu/cl/kernels/ClPool3dKernel.h | 2 +- tests/validation/CL/Pooling3dLayer.cpp | 74 ++++++++- tests/validation/fixtures/Pooling3dLayerFixture.h | 13 ++ 11 files changed, 334 insertions(+), 25 deletions(-) create mode 100644 src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl diff --git a/Android.bp b/Android.bp index 691e46e5ac..c19d0d4672 100644 --- a/Android.bp +++ b/Android.bp @@ -110,6 +110,7 @@ opencl_srcs = [ "src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl", "src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl", "src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl", + "src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl", "src/core/CL/cl_kernels/nhwc/pooling_layer.cl", "src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl", "src/core/CL/cl_kernels/nhwc/reorg_layer.cl", diff --git a/SConscript b/SConscript index 3e5fa2f449..620e81199e 100644 --- a/SConscript +++ b/SConscript @@ -446,6 +446,7 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl', 'src/core/CL/cl_kernels/nhwc/pooling_layer.cl', 'src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl', + 'src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl', 'src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl', 'src/core/CL/cl_kernels/nhwc/reorg_layer.cl', 'src/core/CL/cl_kernels/nhwc/scale.cl', diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h index 8bad449c6f..91c46770da 100644 --- a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h +++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h @@ -62,13 +62,15 @@ public: * |:--------------|:--------------| * |F16 |F16 | * |F32 |F32 | + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | * * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise * Cases where pooling region is completely outside input tensor are not supported * * @note Asymmetric padding is not supported when dimension rounding type == CEIL. * - * @param[in,out] input Source tensor. Data types supported: F16/F32. + * @param[in,out] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo. */ @@ -76,14 +78,14 @@ public: /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Source tensor. Data types supported: F16/F32. + * @param[in,out] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo. */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLPooling3dLayer * - * @param[in] input Source tensor info. Data types supported: F16/F32. + * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo. * diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox index c0888f1775..51a72bf2d7 100644 --- a/docs/user_guide/operator_list.dox +++ b/docs/user_guide/operator_list.dox @@ -2330,6 +2330,8 @@ where N = batches, C = channels, H = height, W = width, D = depth srcdst F16F16 F32F32 + QASYMM8QASYMM8 + QASYMM8_SIGNEDQASYMM8_SIGNED PReluLayer @@ -3155,4 +3157,4 @@ where N = batches, C = channels, H = height, W = width, D = depth */ -} // namespace \ No newline at end of file +} // namespace diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl index 7c6414312f..4e5481d1db 100644 --- a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl +++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer.cl @@ -177,8 +177,15 @@ __kernel void pooling_3d_layer_MxN_ndhwc( res0 = SQRT_OP(res0); #endif // defined(POOL_L2) - // Store result -#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + + + + // Store result +#if defined(QUANTIZED) + STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#elif defined(FP_MIXED_PRECISION) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl new file mode 100644 index 0000000000..abf0db9d07 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "tile_helpers.h" // Needed for GET_SPATIAL_IDX() + +#if defined(POOL_AVG) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) */ +#define POOL_OP(x, y) (max((x), (y))) +#endif /* defined(POOL_AVG) */ + +#define SQRT_OP(x) sqrt((x)) + +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z) + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) +#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ + { \ + const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \ + const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \ + res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ + } +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + +#if defined(POOL_L2) +#error "L2 pooling is not supported" +#endif /* defined(POOL_L2) */ + +/** Performs 3d pooling layer of size equal to MxNXD. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time + * + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are QASYMM8_SIGNED, QASYMM8 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X, -DPOOL_SIZE_Y, and -DPOOL_SIZE_Z. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4, -DPOOL_SIZE_Z=2 + * @note Input tensor width, height and depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, and -DSRC_DEPTH + * @note Output tensor height, channels, depth, and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS, -DDST_DEPTH, and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z which are the steps of the window along the x, y and z directions + * @note Pool pads must be passed at compile time using -DPAD_X, -DPAD_Y, -DPAD_Z + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED, QASYMM8 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes) + * @param[in] input_step_v input_stride_v * number of elements along V processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_stride_v Stride of the destination tensor in V dimension (in bytes) + * @param[in] output_step_v output_stride_v * number of elements along V processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_3d_layer_MxN_ndhwc_quantized( + TENSOR5D_DECLARATION(input), + TENSOR5D_DECLARATION(output)) +{ + // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 + // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side + int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + int idx_out_w = GET_SPATIAL_IDX(1, 1, 0); + + // The depth size dimension and the batch size dimension are collapsed over the height dimension + int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT; + int idx_out_d = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) % DST_DEPTH; + int idx_out_n = (GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT) / DST_DEPTH; + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_v; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_d * + output_stride_w + idx_out_n * output_stride_v; + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; + + int idx_in_w = idx_out_w * STRIDE_X - (int)PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - (int)PAD_Y; + int idx_in_d = idx_out_d * STRIDE_Z - (int)PAD_Z; + + // The start of width to consider in calculation should exclude padding + int pool_x_s = max((int)0, -idx_in_w); + // Assumed Symmetric Padding (left padding = right padding = PAD_X), the filter end should be either the pool width or what is remaining from current pos to the (src width + pad right) + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH + PAD_X - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT + PAD_Y - idx_in_h); + int pool_z_s = max((int)0, -idx_in_d); + int pool_z_e = min((int)POOL_SIZE_Z, (int)SRC_DEPTH + PAD_Z - idx_in_d); + +#if defined(POOL_AVG) && defined(EXCLUDE_PADDING) + int filter_size = 0; +#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING) + int filter_size = pool_z_e * pool_y_e * pool_x_e; +#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING) + + // The end of width to consider in calculation should exclude PAD_X + pool_x_e = min(pool_x_e, SRC_WIDTH - idx_in_w); + pool_y_e = min(pool_y_e, SRC_HEIGHT - idx_in_h); + pool_z_e = min(pool_z_e, SRC_DEPTH - idx_in_d); + + for(int z = pool_z_s; z < pool_z_e; ++z) + { + int depth_offset_src = (z + idx_in_d) * input_stride_w; + for(int y = pool_y_s; y < pool_y_e; ++y) + { + int height_offset_src = (y + idx_in_h) * input_stride_z; +#pragma unroll 8 + for(int x = pool_x_s; x < pool_x_e; ++x) + { + int width_offset_src = (x + idx_in_w) * input_stride_y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data; + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + data0; + + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + width_offset_src + height_offset_src + depth_offset_src)); + data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + + res0 = POOL_OP(res0, data0); + +#if defined(POOL_AVG) && defined(EXCLUDE_PADDING) + filter_size++; +#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING) + } + } + } + +#if defined(POOL_AVG) + res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; +#endif // defined(POOL_AVG) + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + + STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +} +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z) +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index f675dbe6fd..1bf7f2b3ac 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -438,6 +438,7 @@ const std::map ClKernelLibrary::_kernel_program_map = { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" }, + { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" }, { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, { "scale_bilinear_nhwc", "nhwc/scale.cl" }, @@ -905,6 +906,10 @@ const std::map ClKernelLibrary::_program_source_map = { "nhwc/pooling_3d_layer.cl", #include "./cl_kernels/nhwc/pooling_3d_layer.clembed" + }, + { + "nhwc/pooling_3d_layer_quantized.cl", +#include "./cl_kernels/nhwc/pooling_3d_layer_quantized.clembed" }, { "nhwc/pooling_layer_quantized.cl", diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp index 929ccf7cb6..a090ac5774 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp @@ -49,19 +49,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding + && (pool_info.pool_type == PoolingType::AVG)), + "Exclude padding is unsupported for non-float types for Avg op"); - const auto data_layout = src->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH); - const bool is_global_pooling = pool_info.is_global_pooling; + const auto data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH); + const bool is_global_pooling = pool_info.is_global_pooling; const unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; const unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; const unsigned int pool_size_z = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth; - int output_width = 0; - int output_height = 0; - int output_depth = 0; + int output_width = 0; + int output_height = 0; + int output_depth = 0; bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric."); @@ -143,10 +146,31 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth))); + // If datatype is quantized add relevant parameters + if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + // Set the initial value for the pooling operation accordingly with the data type if(pool_type == PoolingType::MAX) { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); + if(is_data_type_quantized(data_type)) + { + PixelValue type_min{}; + std::tie(type_min, std::ignore) = get_min_max(data_type); + build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get())); + } + else + { + build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); + } } else { @@ -164,6 +188,11 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT { acc_data_type = DataType::F32; } + else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division + { + acc_data_type = DataType::S32; + } + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); @@ -172,8 +201,10 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - std::string kernel_name = "pooling_3d_layer_MxN_ndhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // if datatype is quantized use quantized kernel function + std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc"); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h index 30c76ed632..00852349e6 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.h +++ b/src/gpu/cl/kernels/ClPool3dKernel.h @@ -46,7 +46,7 @@ public: * @note Asymmetric padding is not supported when dimension rounding type == CEIL. * * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. */ diff --git a/tests/validation/CL/Pooling3dLayer.cpp b/tests/validation/CL/Pooling3dLayer.cpp index 5c80351da2..84d630e6cf 100644 --- a/tests/validation/CL/Pooling3dLayer.cpp +++ b/tests/validation/CL/Pooling3dLayer.cpp @@ -59,10 +59,18 @@ const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datase framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })), framework::dataset::make("ExcludePadding", { true, false })); +const auto Pooling3DLayerDatasetQuantized = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), + framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })), + framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(1, 1, 2), Size3D(2, 2, 1)})), + framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })), + framework::dataset::make("ExcludePadding", { true })); + using ShapeDataset = framework::dataset::ContainerDataset>; -constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */ -constexpr AbsoluteTolerance tolerance_f16(0.1f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_f16(0.1f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED integer datatype*/ +constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */ } // namespace @@ -133,8 +141,64 @@ using CLSpecialPooling3dLayerFixture = SpecialPooling3dLayerValidationFixture using CLPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture; +template +using CLPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture; + // clang-format on // *INDENT-ON* +TEST_SUITE(QUANTIZED) + +TEST_SUITE(QASYMM8) +// Small Dataset Quantized Dataset +FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(), + combine(Pooling3DLayerDatasetQuantized, + framework::dataset::make("DataType", DataType::QASYMM8))), + framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, 10), QuantizationInfo(1.f / 127.f, 10) })), + framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, 5), QuantizationInfo(1.f / 127.f, 10) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +// Large Dataset Quantized Dataset +FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(), + combine(Pooling3DLayerDatasetQuantized, + framework::dataset::make("DataType", DataType::QASYMM8))), + framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, 10), QuantizationInfo(1.f / 127.f, 10) })), + framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, 5), QuantizationInfo(1.f / 127.f, 10) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() + +TEST_SUITE(QASYMM8_SIGNED) + +// Large Dataset Quantized Dataset Signed +FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(), + combine(Pooling3DLayerDatasetQuantized, + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))), + framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10), QuantizationInfo(1.f / 127.f, -10) })), + framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -5), QuantizationInfo(1.f / 127.f, -10) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed); +} + +// Large Dataset Quantized pooling test +FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(), + combine(Pooling3DLayerDatasetQuantized, + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))), + framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10), QuantizationInfo(1.f / 127.f, -10) })), + framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -5), QuantizationInfo(1.f / 127.f, -10) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed); +} + +TEST_SUITE_END() +TEST_SUITE_END() + TEST_SUITE(Float) TEST_SUITE(FP32) @@ -152,8 +216,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerFixture, framework::Data } FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP, - framework::dataset::make("DataType", - DataType::F32)))) + framework::dataset::make("DataType", DataType::F32)))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -219,8 +282,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPooling3dLayerFixture, framework::Datas } FIXTURE_DATA_TEST_CASE(RunLarge, CLPooling3dLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP, - framework::dataset::make("DataType", - DataType::F16)))) + framework::dataset::make("DataType", DataType::F16)))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); diff --git a/tests/validation/fixtures/Pooling3dLayerFixture.h b/tests/validation/fixtures/Pooling3dLayerFixture.h index 563f1dcced..9ba7bab53a 100644 --- a/tests/validation/fixtures/Pooling3dLayerFixture.h +++ b/tests/validation/fixtures/Pooling3dLayerFixture.h @@ -163,6 +163,19 @@ public: } }; +template +class Pooling3dLayerValidationQuantizedFixture : public Pooling3dLayerValidationGenericFixture +{ +public: + template + void setup(TensorShape shape, PoolingType pool_type, Size3D pool_size, Size3D stride, Padding3D padding, bool exclude_padding, DataType data_type, + QuantizationInfo input_qinfo, QuantizationInfo output_qinfo) + { + Pooling3dLayerValidationGenericFixture::setup(shape, Pooling3dLayerInfo(pool_type, pool_size, stride, padding, exclude_padding), + data_type, input_qinfo, output_qinfo); + } +}; + } // namespace validation } // namespace test } // namespace arm_compute -- cgit v1.2.1