From 5b52fe3a4481769adcf42218a3747486cb4e9c14 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 12 Jul 2018 12:42:35 +0100 Subject: COMPMID-1390: OCLGrind and benchmark tests fail for QASYMM8 COMPMID-1392: OCLGrind failures in im2col1x1_stridex1_dchw COMPMID-1395: OCLGrind failures in output_stage_quantized Change-Id: I35504bd1f701316df122be52d458c71bbd7e7909 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/139722 Tested-by: Jenkins Reviewed-by: Giorgio Arena Reviewed-by: Anthony Barbier --- .../direct_convolution_1x1_3x3_5x5_quantized.cl | 24 +++++++++--- .../CLDirectConvolutionOutputStageKernel.cpp | 44 ++++++++-------------- src/core/CL/kernels/CLIm2ColKernel.cpp | 2 +- tests/benchmark/fixtures/ConvolutionLayerFixture.h | 10 ++--- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl index ae87420774..83da76785b 100644 --- a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl +++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl @@ -248,6 +248,12 @@ __kernel void direct_convolution_1x1_3x3_5x5_quantized( } #endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) +#if defined(VEC_SIZE) + +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_SAT_UCHAR_STR(x, size) (convert_uchar##size##_sat((x))) +#define CONVERT_SAT_UCHAR(x, size) CONVERT_SAT_UCHAR_STR(x, size) + /** This function computes the output stage of a depthwise convolution. * * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8 @@ -274,7 +280,6 @@ __kernel void direct_convolution_1x1_3x3_5x5_quantized( * @param[in] output_multiplier Output scale multiplier * @param[in] output_shift Output scale divisor exponent */ - __kernel void output_stage_quantized( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), @@ -292,22 +297,29 @@ __kernel void output_stage_quantized( #endif //defined(HAS_BIAS) // Load input - int16 vals = vload16(0, (__global int *)(src.ptr)); + VEC_INT vals = VLOAD(VEC_SIZE)(0, (__global int *)(src.ptr)); #if defined(HAS_BIAS) // Load and add bias #if defined(NCHW) int bias_value = *((__global int *)(vector_offset(&bias, get_global_id(2)))); #else // defined(NCHW) - int16 bias_value = vload16(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * 16)))); + VEC_INT bias_value = VLOAD(VEC_SIZE)(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * VEC_SIZE)))); #endif // defined(NCHW) - vals += (int16)(bias_value); + vals += (VEC_INT)(bias_value); #endif //defined(HAS_BIAS) - vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, 16); + vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, VEC_SIZE); vals = vals + output_offset; // Store result in dst - vstore16(convert_uchar16_sat(vals), 0, (__global uchar *)dst.ptr); + VSTORE(VEC_SIZE) + (CONVERT_SAT_UCHAR(vals, VEC_SIZE), 0, (__global uchar *)dst.ptr); } + +#undef VEC_INT +#undef CONVERT_SAT_UCHAR_STR +#undef CONVERT_SAT_UCHAR + +#endif // defined(VEC_SIZE) diff --git a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp index 3d9d520841..4e2352cf6e 100644 --- a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp @@ -90,44 +90,29 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type()); - // Update processed elements when input is S32 (comes from quantization input) - if(input->data_type() == DataType::S32) - { - num_elems_processed_per_iteration = 16; - } - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + + // Input window AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, input_access); + + // Bias window + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->dimension(1)); + window_changed = window_changed || update_window_and_padding(win, bias_access); + } + // Output window if(output != nullptr && (output->total_size() != 0)) { AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - if(bias == nullptr) - { - window_changed = update_window_and_padding(win, input_access, output_access); - } - else - { - AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1)); - window_changed = update_window_and_padding(win, input_access, output_access, bias_access); - } - + window_changed = window_changed || update_window_and_padding(win, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); } else { - if(bias == nullptr) - { - window_changed = update_window_and_padding(win, input_access); - } - else - { - AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1)); - window_changed = update_window_and_padding(win, input_access, bias_access); - } - input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape())); } @@ -165,10 +150,13 @@ void CLDirectConvolutionLayerOutputStageKernel::configure(ICLTensor *input, cons _result_shift = result_shift; _result_offset_after_shift = result_offset_after_shift; + const unsigned int num_elems_accessed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type()); + // Create kernel CLBuildOptions build_opts; build_opts.add_option_if(bias != nullptr, "-DHAS_BIAS"); build_opts.add_option("-D" + string_from_data_layout(input->info()->data_layout())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration)); _kernel = static_cast(CLKernelLibrary::get().create_kernel("output_stage_quantized", build_opts.options())); // Set static kernel arguments diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 06ca005dd5..b1290b8edd 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -262,7 +262,7 @@ CLIm2ColKernel::configure_window(const ICLTensor *input, ICLTensor *output, cons AccessWindowStatic input_access(input->info(), -border.left, -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width), + ceil_to_multiple(input_width + border.right, kernel_dims.width * _num_elems_processed_per_iteration), input_height + border.bottom); update_window_and_padding(win, input_access); } diff --git a/tests/benchmark/fixtures/ConvolutionLayerFixture.h b/tests/benchmark/fixtures/ConvolutionLayerFixture.h index 338a02162d..b23c3457ab 100644 --- a/tests/benchmark/fixtures/ConvolutionLayerFixture.h +++ b/tests/benchmark/fixtures/ConvolutionLayerFixture.h @@ -46,16 +46,16 @@ public: int batches) { // Set batched in source and destination shapes - src_shape.set(3 /* batch */, batches); dst_shape.set(3 /* batch */, batches); - DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type; + DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type; + const QuantizationInfo qinfo(2.f / 255.f, 127); // Create tensors - src = create_tensor(src_shape, data_type, 1); - weights = create_tensor(weights_shape, data_type, 1); + src = create_tensor(src_shape, data_type, 1, qinfo); + weights = create_tensor(weights_shape, data_type, 1, qinfo); biases = create_tensor(biases_shape, bias_data_type, 1); - dst = create_tensor(dst_shape, data_type, 1); + dst = create_tensor(dst_shape, data_type, 1, qinfo); // Create and configure function conv_layer.configure(&src, &weights, &biases, &dst, info, WeightsInfo(), dilation, act_info); -- cgit v1.2.1