From 13ec5f0a09e038f12cbe0f3b119a215934b72b42 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Thu, 2 Jan 2020 12:11:13 +0000 Subject: COMPMID-2800: Add support for QASYMM8_SIGNED in NEDepthwiseConvolutionLayer3x3Kernel Change-Id: Ia5d23ff2c9e59c80ded2fac5ca02704214f0a01a Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2537 Comments-Addressed: Arm Jenkins Reviewed-by: Pablo Marquez Tested-by: Arm Jenkins --- .../NEDepthwiseConvolutionLayer3x3Kernel.cpp | 21 +++++++++++++-------- .../NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 10 +++++----- src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp | 20 ++++++++++---------- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 2 +- .../NEDepthwiseConvolutionAssemblyDispatch.cpp | 14 +++++++++----- 5 files changed, 38 insertions(+), 29 deletions(-) (limited to 'src') diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp index e47786525e..1dd05d2cf1 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -63,7 +63,7 @@ public: const int kernel_stride_z = weights->info()->strides_in_bytes().z(); const int output_w = output->info()->dimension(0); const int output_h = output->info()->dimension(1); - const int delta_input = detail::get_input_num_elems_processed(num_elems_written_per_iteration); + const int delta_input = detail::get_input_num_elems_processed(num_elems_written_per_iteration, stridex); const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); const unsigned int conv_pad_x = conv_info.pad_left(); const unsigned int conv_pad_y = conv_info.pad_top(); @@ -107,8 +107,8 @@ public: { auto in_top = reinterpret_cast(input_ptr + (ih + 0) * input_stride_y); auto in_mid = reinterpret_cast(input_ptr + (ih + dilation.y()) * input_stride_y); - auto in_low = reinterpret_cast(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); //uint8 - auto p_out = reinterpret_cast(out.ptr() + oh * output_stride_y); //int32 + auto in_low = reinterpret_cast(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); // uint8/int8 + auto p_out = reinterpret_cast(out.ptr() + oh * output_stride_y); // int32 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, @@ -116,12 +116,12 @@ public: { if(dilation == Size2D(1U, 1U)) { - auto vres = detail::convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset); + auto vres = detail::convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, stridex, input_offset); detail::store_results(p_out, vres); } else { - auto vres = detail::convolve_3x3_dilation(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), input_offset); + auto vres = detail::convolve_3x3_dilation(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), stridex, input_offset); detail::store_results(p_out, vres); } } @@ -156,7 +156,7 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_written_pe Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); const DataLayout data_layout = input->data_layout(); @@ -192,7 +192,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // Get convolved dimensions const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type(); + const DataType output_dt = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type(); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt).set_quantization_info(output->quantization_info())); @@ -209,6 +209,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen switch(input->data_type()) { case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -263,6 +264,7 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const switch(input->info()->data_type()) { case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: case DataType::F32: _num_elems_written_per_iteration = 16 >> _conv_info.stride().first; break; @@ -307,6 +309,9 @@ void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const Threa case DataType::QASYMM8: convolve_3x3(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation); break; + case DataType::QASYMM8_SIGNED: + convolve_3x3(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation); + break; default: ARM_COMPUTE_ERROR("Not implemented"); } diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index bcf70b3ad8..4a71c1edea 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -651,7 +651,7 @@ public: const int output_w = output->info()->dimension(0); const int output_h = output->info()->dimension(1); const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); const int kernel_depth = weights->info()->dimension(Window::DimZ); const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); const unsigned int conv_pad_left = conv_info.pad_left(); @@ -718,7 +718,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2); + auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex); store_results(p_out, vres); } } @@ -743,7 +743,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2); + auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex); accumulate_results(p_out, vres); } } @@ -774,7 +774,7 @@ public: const int output_w = output->info()->dimension(0); const int output_h = output->info()->dimension(1); const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); const int kernel_depth = weights->info()->dimension(Window::DimZ); const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); const unsigned int conv_pad_left = conv_info.pad_left(); diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 72632492d7..374005d897 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -196,7 +196,7 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w auto vector_sum_row = reinterpret_cast(out.ptr()); - wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row)); + wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row)); }, in, out); } @@ -352,10 +352,10 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const auto vector_sum_col = reinterpret_cast(out.ptr()); - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); }, in, out); } @@ -467,10 +467,10 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const auto vector_sum_col = reinterpret_cast(out.ptr()); - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3])); + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); }, inb, out); } diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 0320002fba..beb024c529 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -39,7 +39,7 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); if(!is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp index 142f873ef4..e0094f4eec 100644 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -469,8 +469,12 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso } // Check data type - const DataType data_type = weights->data_type(); - bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type) || data_type == DataType::QSYMM8_PER_CHANNEL; + // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer + const DataType input_type = input->data_type(); + const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; + const DataType weights_type = weights->data_type(); + const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED + || weights_type == DataType::QSYMM8_PER_CHANNEL; // Check weighs size std::set supported_kernel_sizes = { 3, 5 }; @@ -496,12 +500,12 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1)); - if(data_type == DataType::QSYMM8_PER_CHANNEL) + if(weights_type == DataType::QSYMM8_PER_CHANNEL) { is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U)); } - return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported; + return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported; } void NEDepthwiseConvolutionAssemblyDispatch::run() -- cgit v1.2.1