From a799ce0ad775829862891dd98d1232638ec8761e Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 12 Sep 2018 20:11:34 +0100 Subject: COMPMID-1564: Add NEDepthwiseConvolution3x3 for QASYMM8 Change-Id: I1f55508af6f220e5f41df7b56daffb4761ed0591 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148253 Tested-by: bsgcomp Reviewed-by: Isabella Gottardi --- .../kernels/convolution/depthwise/depthwise.hpp | 17 +- .../kernels/convolution/depthwise/impl_base.hpp | 26 +- examples/graph_mobilenet.cpp | 33 ++- .../NEDepthwiseConvolutionLayer3x3Kernel.cpp | 37 ++- .../NEDirectConvolutionLayerOutputStageKernel.cpp | 104 ++++++- .../depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp | 2 +- .../depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp | 2 +- .../depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp | 2 +- .../depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp | 2 +- .../depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp | 128 +++++++++ .../depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp | 166 +++++++++++ .../convolution/depthwise/impl_fp16_fp16.hpp | 8 +- .../convolution/depthwise/impl_fp32_fp32.hpp | 8 +- .../kernels/convolution/depthwise/impl_u8_s32.hpp | 315 +++++++++++++++++++++ .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 40 +-- tests/datasets/DepthwiseConvolutionLayerDataset.h | 2 + .../validation/NEON/DepthwiseConvolutionLayer.cpp | 10 + utils/Utils.h | 4 +- 18 files changed, 846 insertions(+), 60 deletions(-) create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp index 4ca68116db..472c44f97a 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp @@ -33,6 +33,7 @@ class IDepthwiseConvolution virtual ~IDepthwiseConvolution() = default; virtual int output_size(const int dim_size, const bool padding_same) const = 0; virtual unsigned int get_window(void) const = 0; + virtual void set_offsets(int input_offset, int weights_offset) = 0; virtual void run(const unsigned int start, const unsigned int stop) = 0; }; @@ -179,6 +180,13 @@ class DepthwiseConvolution : public IDepthwiseConvolution >::get_output_size(dim_size, padding_same); } + /** Sets quantization offsets + * + * @param[in] input_offset Input offset + * @param[in] weights_offset Weights offset + */ + void set_offsets(int input_offset, int weights_offset) override; + /** Get the window of work to be performed by an instance of the operator. */ unsigned int get_window(void) const override; @@ -212,7 +220,9 @@ class DepthwiseConvolution : public IDepthwiseConvolution const int row_pad_out_bottom, const int n_tiles, const int n_input_cols, - const int n_output_cols + const int n_output_cols, + const int input_offset, + const int weights_offset ); // Determine the maximum (and minimum) padding values which can be applied @@ -272,7 +282,9 @@ class DepthwiseConvolution : public IDepthwiseConvolution const int _in_pad_bottom, const int _in_pad_right, const int _out_pad_bottom, - const int _out_pad_right + const int _out_pad_right, + const int _input_offset, + const int _weights_offset ); /* Arrays of methods to process tensor tiles. @@ -300,6 +312,7 @@ class DepthwiseConvolution : public IDepthwiseConvolution const int _weight_col_stride, _weight_row_stride; const int _input_col_stride, _input_row_stride, _input_batch_stride; const int _output_col_stride, _output_row_stride, _output_batch_stride; + int _input_offset, _weights_offset; }; } // namespace depthwise diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp index 17889849db..e262817a3c 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp @@ -82,7 +82,8 @@ DepthwiseConvolution::DepthwiseConvolution( _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride), _output_col_stride(output_col_stride ? output_col_stride : _n_channels), _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride), - _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride) + _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride), + _input_offset(0), _weights_offset(0) { } @@ -94,6 +95,12 @@ unsigned int DepthwiseConvolution::get_wind return iceildiv(_n_channels, CHANNEL_BLOCK); } +template +void DepthwiseConvolution::set_offsets(int input_offset, int weights_offset) +{ + _input_offset = input_offset; + _weights_offset = weights_offset; +} template void DepthwiseConvolution::run( @@ -145,7 +152,8 @@ void DepthwiseConvolution::run( outptr_row + start_channel, _output_row_stride, _output_col_stride, input_row_pad_top, input_pad_left, input_row_pad_bottom, output_row_pad_bottom, - _n_tile_cols, _n_input_cols, _n_output_cols + _n_tile_cols, _n_input_cols, _n_output_cols, + _input_offset, _weights_offset ); } } @@ -170,7 +178,9 @@ void DepthwiseConvolution::process_tile_row const int row_pad_out_bottom, const int n_tiles, const int n_input_cols, - const int n_output_cols + const int n_output_cols, + const int input_offset, + const int weights_offset ) { constexpr int tile_overlap = kernel_cols - stride_cols; @@ -242,7 +252,7 @@ void DepthwiseConvolution::process_tile_row inptr_col, in_row_stride, in_col_stride, outptr_col, out_row_stride, out_col_stride, row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, - row_pad_out_bottom, t_pad_out_right + row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset ); } } @@ -313,7 +323,9 @@ struct DepthwiseConvolutionImpl : public DepthwiseConvolution< const int in_pad_bottom=0, const int in_pad_right=0, const int out_pad_bottom=0, - const int out_pad_right=0 + const int out_pad_right=0, + const int input_offset=0, + const int weights_offset=0 ); }; @@ -340,7 +352,9 @@ void DepthwiseConvolutionImpl::process_tile const int _in_pad_bottom, const int _in_pad_right, const int _out_pad_bottom, - const int _out_pad_right + const int _out_pad_right, + const int _input_offset, + const int _weights_offset ) { constexpr auto inner_tile_rows = DWC::inner_tile_rows; diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp index 35ab224700..ab6a4a842f 100644 --- a/examples/graph_mobilenet.cpp +++ b/examples/graph_mobilenet.cpp @@ -183,6 +183,12 @@ private: // Get trainable parameters data path std::string data_path = common_params.data_path; + // Add model path to data path + if(!data_path.empty()) + { + data_path += "/cnn_data/mobilenet_qasymm8_model/"; + } + // Quantization info taken from the AndroidNN QASYMM8 MobileNet example const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128); const QuantizationInfo mid_quant_info = QuantizationInfo(0.0784313753247f, 128); @@ -228,14 +234,15 @@ private: }; graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info), - get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/" + common_params.image)) + get_weights_accessor(data_path, common_params.image)) << ConvolutionLayer( 3U, 3U, 32U, - get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Conv2d_0_weights.npy"), - get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Conv2d_0_bias.npy"), + get_weights_accessor(data_path, "Conv2d_0_weights.npy"), + get_weights_accessor(data_path, "Conv2d_0_bias.npy"), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), 1, conv_weights_quant_info.at(0), mid_quant_info) - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)); + .set_name("Conv2d_0") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv2d_0/Relu6"); graph << get_dwsc_node_qasymm(data_path, "Conv2d_1", 64U, PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(0), point_weights_quant_info.at(0)); graph << get_dwsc_node_qasymm(data_path, "Conv2d_2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(1), point_weights_quant_info.at(1)); @@ -261,12 +268,14 @@ private: point_weights_quant_info.at(11)); graph << get_dwsc_node_qasymm(data_path, "Conv2d_13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(12), point_weights_quant_info.at(12)) - << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)) + << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)).set_name("Logits/AvgPool_1a") << ConvolutionLayer( 1U, 1U, 1001U, - get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Logits_Conv2d_1c_1x1_weights.npy"), - get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Logits_Conv2d_1c_1x1_bias.npy"), - PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1)); + get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"), + get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_bias.npy"), + PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1)) + .set_name("Logits/Conv2d_1c_1x1"); + ; } ConcatLayer get_dwsc_node_float(const std::string &data_path, std::string &¶m_path, @@ -312,7 +321,7 @@ private: PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info, QuantizationInfo depth_weights_quant_info, QuantizationInfo point_weights_quant_info) { - std::string total_path = "/cnn_data/mobilenet_qasymm8_model/" + param_path + "_"; + std::string total_path = param_path + "_"; SubStream sg(graph); sg << DepthwiseConvolutionLayer( @@ -320,13 +329,15 @@ private: get_weights_accessor(data_path, total_path + "depthwise_weights.npy"), get_weights_accessor(data_path, total_path + "depthwise_bias.npy"), dwc_pad_stride_info, depth_weights_quant_info) - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)) + .set_name(total_path + "depthwise/depthwise") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "depthwise/Relu6") << ConvolutionLayer( 1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "pointwise_weights.npy"), get_weights_accessor(data_path, total_path + "pointwise_bias.npy"), conv_pad_stride_info, 1, point_weights_quant_info) - << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)); + .set_name(total_path + "pointwise/Conv2D") + << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "pointwise/Relu6"); return ConcatLayer(std::move(sg)); } diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp index 7029b06615..99bdb7a70e 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp @@ -198,8 +198,10 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height + const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type(); + // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape)); + auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt)); // Configure window (optimised) // Set padding in channels @@ -324,7 +326,7 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso } // Check supported data type - bool supported_datatype = is_data_type_float(dt); + bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt); // Check for supported strides const auto &strides = conv_info.stride(); @@ -345,11 +347,15 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver() { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights); ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3); _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true); + if(_convolver) + { + _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset); + } } void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic() @@ -433,6 +439,31 @@ std::unique_ptr NEDepthwiseConvolutionLayer3x3 const auto stride_x = conv_info.stride().first; switch(dt) { + case DataType::QASYMM8: + { + switch(stride_x) + { + case 1: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, padding_same, + reinterpret_cast(w->ptr_to_element(Coordinates())), + in->ptr_to_element(Coordinates()), + reinterpret_cast(out->ptr_to_element(Coordinates())), weight_col_stride, + weight_row_stride, input_col_stride, input_row_stride, input_batch_stride, + output_col_stride, output_row_stride, output_batch_stride); + case 2: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, padding_same, + reinterpret_cast(w->ptr_to_element(Coordinates())), + in->ptr_to_element(Coordinates()), + reinterpret_cast(out->ptr_to_element(Coordinates())), weight_col_stride, + weight_row_stride, input_col_stride, input_row_stride, input_batch_stride, + output_col_stride, output_row_stride, output_batch_stride); + default: + return nullptr; + } + break; + } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 864c63f731..a571d54501 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -194,8 +194,8 @@ inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y) #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template -void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); @@ -304,14 +304,14 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window internal_vst1q(out_ptr, internal_vld1q(in_ptr)); } }, - in, bi); + in, bi, out); } } // QASYMM8 specializations template <> -void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); uint8x16_t min = vdupq_n_u8(0); @@ -352,8 +352,8 @@ void output_stage(ITensor *input, const ITensor * in, out); } template <> -void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) { ARM_COMPUTE_UNUSED(bias); @@ -382,6 +382,85 @@ void output_stage(ITensor *input, const ITensor }, in, out); } +template <> +void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + uint8x16_t min = vdupq_n_u8(0); + uint8x16_t max = vdupq_n_u8(255); + + Window window_bias = window; + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + Iterator in(input, window); + Iterator bi(bias, window_bias); + + Iterator out(output, window); + execute_window_loop(window, [&](const Coordinates & id) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto bias_ptr = reinterpret_cast(bi.ptr()); + + // Accumulate bias + int32x4x4_t v_in = + { + { + vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)), + vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)), + vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)), + vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12)) + } + }; + + const auto out_ptr = out.ptr(); + vst1q_u8(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max)); + }, + in, bi, out); +} +template <> +void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + ARM_COMPUTE_UNUSED(bias); + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + uint8x16_t min = vdupq_n_u8(0); + uint8x16_t max = vdupq_n_u8(255); + + Window window_bias = window; + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + Iterator in(input, window); + Iterator bi(bias, window_bias); + + Iterator out(output, window); + execute_window_loop(window, [&](const Coordinates & id) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + + // Accumulate bias + int32x4x4_t v_in = + { + { + vld1q_s32(in_ptr), + vld1q_s32(in_ptr + 4), + vld1q_s32(in_ptr + 8), + vld1q_s32(in_ptr + 12) + } + }; + + const auto out_ptr = out.ptr(); + vst1q_u8(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max)); + }, + in, bi, out); +} } // namespace NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel() @@ -426,19 +505,19 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const { case DataType::S32: { - _func = (bias == nullptr) ? &output_stage : &output_stage; + _func = (bias == nullptr) ? &output_stage_nchw : &output_stage_nchw; break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - _func = (output == nullptr) ? &output_stage : &output_stage; + _func = (output == nullptr) ? &output_stage_nchw : &output_stage_nchw; break; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: { - _func = (output == nullptr) ? &output_stage : &output_stage; + _func = (output == nullptr) ? &output_stage_nchw : &output_stage_nchw; break; } default: @@ -451,6 +530,11 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const { switch(input->info()->data_type()) { + case DataType::S32: + { + _func = (output == nullptr) ? &output_stage_nhwc : &output_stage_nhwc; + break; + } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp index c5a056560b..ca1de26ed7 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp @@ -43,7 +43,7 @@ void ConvImpl::process_tile( float* const outptr, const int out_row_stride, const int out_col_stride, - const int, const int, const int, const int, const int, const int + const int, const int, const int, const int, const int, const int, const int, const int ) { // Copy pointers diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp index 0c96bebc02..21e8f04774 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp @@ -43,7 +43,7 @@ void ConvImpl::process_tile( float* const outptr, const int out_row_stride, const int out_col_stride, - const int, const int, const int, const int, const int, const int + const int, const int, const int, const int, const int, const int, const int, const int ) { // Copy pointers diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp index 941c8e9248..c7113d05b3 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp @@ -43,7 +43,7 @@ void ConvImpl::process_tile( float* const outptr, const int out_row_stride, const int out_col_stride, - const int, const int, const int, const int, const int, const int + const int, const int, const int, const int, const int, const int, const int, const int ) { // Copy pointers diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp index 1cbd6d5623..c36c24ec0f 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp @@ -43,7 +43,7 @@ void ConvImpl::process_tile( float* const outptr, const int out_row_stride, const int out_col_stride, - const int, const int, const int, const int, const int, const int + const int, const int, const int, const int, const int, const int, const int, const int ) { constexpr auto inner_tile_rows = DWC::inner_tile_rows; diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp new file mode 100644 index 0000000000..8f22a64ea6 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "impl_u8_s32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>; +using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>; + +template <> +const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile; + +template <> +const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = { + ConvImpl::template process_tile, +}; + +template <> +const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = { + ConvImpl::template process_tile, +}; + +template <> +const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = { + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, +}; + +template <> +const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = { + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, +}; + +template <> +const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile; + +template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp new file mode 100644 index 0000000000..cf515504c7 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "impl_u8_s32.hpp" + +namespace depthwise +{ +using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>; +using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>; + +template <> +const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile; + +template <> +const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = { + ConvImpl::template process_tile, + ConvImpl::template process_tile, +}; + +template <> +const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = { + ConvImpl::template process_tile, + ConvImpl::template process_tile, +}; + +template <> +const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = { + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, +}; + +template <> +const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = { + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, + { + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + ConvImpl::template process_tile, + }, +}; + +template <> +const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile; + +template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp index ed4cfb86b9..dacfb24c89 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp @@ -75,7 +75,9 @@ struct DepthwiseConvolutionImpl::p const int _in_pad_bottom, const int _in_pad_right, const int _out_pad_bottom, - const int _out_pad_right + const int _out_pad_right, + const int _input_offset, + const int _weights_offset ) { constexpr auto inner_tile_rows = DWC::inner_tile_rows; diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp index 7a216ed518..840086f917 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp @@ -75,7 +75,9 @@ struct DepthwiseConvolutionImpl::process_t const int _in_pad_bottom, const int _in_pad_right, const int _out_pad_bottom, - const int _out_pad_right + const int _out_pad_right, + const int _input_offset, + const int _weights_offset ) { constexpr auto inner_tile_rows = DWC::inner_tile_rows; diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp new file mode 100644 index 0000000000..d0d8de538d --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * + * NOTE: Header to be included by implementation files only. + * + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp" + +#pragma once + +namespace depthwise +{ +// Partial specialisation for U8 to S32 +template +struct DepthwiseConvolutionImpl +{ + typedef DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t + > DWC; + + template < + bool Specialize=false, // Specialize (or not) the method + int InPadTop=0, // If specialized, top padding + int InPadLeft=0, // If specialized, left padding + int InPadBottom=0, // If specialized, bottom padding + int InPadRight=0, // If specialized, right padding + int OutPadBottom=0, // If specialized, bottom output padding + int OutPadRight=0 // If specialized, bottom right padding + > + static void process_tile( + const int n_channels, + const uint8_t* const weights, + const int weight_row_stride, + const int weight_col_stride, + const uint8_t* const inptr, + const int in_row_stride, + const int in_col_stride, + int32_t* const outptr, + const int out_row_stride, + const int out_col_stride, + const int in_pad_top=0, + const int in_pad_left=0, + const int in_pad_bottom=0, + const int in_pad_right=0, + const int out_pad_bottom=0, + const int out_pad_right=0, + const int input_offset=0, + const int weights_offset=0); +}; + + +template +template < + bool Specialize, + int InPadTop, int InPadLeft, int InPadBottom, int InPadRight, + int OutPadBottom, int OutPadRight +> +void DepthwiseConvolutionImpl::process_tile( + const int n_channels, + const uint8_t *__restrict__ const weights, + const int weight_row_stride, + const int weight_col_stride, + const uint8_t *__restrict__ const inptr, + const int in_row_stride, + const int in_col_stride, + int32_t *__restrict__ const outptr, + const int out_row_stride, + const int out_col_stride, + const int _in_pad_top, + const int _in_pad_left, + const int _in_pad_bottom, + const int _in_pad_right, + const int _out_pad_bottom, + const int _out_pad_right, + const int _input_offset, + const int _weights_offset +) +{ + constexpr auto inner_tile_rows = DWC::inner_tile_rows; + constexpr auto inner_tile_cols = DWC::inner_tile_cols; + constexpr auto kernel_rows = DWC::kernel_rows; + constexpr auto kernel_cols = DWC::kernel_cols; + constexpr auto output_tile_rows = DWC::output_tile_rows; + constexpr auto output_tile_cols = DWC::output_tile_cols; + constexpr auto stride_rows = DWC::stride_rows; + constexpr auto stride_cols = DWC::stride_cols; + + // Extract parameters + const int in_pad_top = Specialize ? InPadTop : _in_pad_top; + const int in_pad_left = Specialize ? InPadLeft : _in_pad_left; + const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom; + const int in_pad_right = Specialize ? InPadRight : _in_pad_right; + const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom; + const int out_pad_right = Specialize ? OutPadRight : _out_pad_right; + + // Compute valid ranges of the tile + const int in_cells_i = inner_tile_rows - in_pad_bottom; + const int in_cells_j = inner_tile_cols - in_pad_right; + const int out_cells_i = output_tile_rows - out_pad_bottom; + const int out_cells_j = output_tile_cols - out_pad_right; + + // Instantiate pointers + const uint8_t* __restrict__ inptr_base = inptr; + const uint8_t* __restrict__ wptr_base = weights; + int32_t* __restrict__ outptr_base = outptr; + + // Perform the depthwise convolution + int channels_remaining = n_channels; +#ifdef __aarch64__ + const int32x4_t v_input_offset = vdupq_n_s32(_input_offset); + const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset); + for (; channels_remaining >= 16; channels_remaining -= 16) + { + // Load input tile + int32x4x4_t u[inner_tile_rows][inner_tile_cols]; + for (int i = 0; i < inner_tile_rows; i++) + { + const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride; + for (int j = 0; j < inner_tile_cols; j++) + { + if (i < in_pad_top || in_cells_i <= i || + j < in_pad_left || in_cells_j <= j) + { + u[i][j].val[0] = vdupq_n_s32(0); + u[i][j].val[1] = vdupq_n_s32(0); + u[i][j].val[2] = vdupq_n_s32(0); + u[i][j].val[3] = vdupq_n_s32(0); + } + else + { + const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride); + u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv))))); + u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv))))); + u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv))))); + u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv))))); + } + } + } + inptr_base += 16; + + // Load weights tile + int32x4x4_t w[kernel_rows][kernel_cols]; + for (int i = 0; i < kernel_rows; i++) + { + const uint8_t* const wptr_row = wptr_base + i*weight_row_stride; + for (int j = 0; j < kernel_cols; j++) + { + const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride); + w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv))))); + w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv))))); + w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv))))); + w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv))))); + } + } + wptr_base += 16; + + // Perform the convolution + int32x4x4_t v[output_tile_rows][output_tile_cols]; + for (int out_i = 0; out_i < out_cells_i; out_i++) + { + for (int out_j = 0; out_j < out_cells_j; out_j++) + { + // Base co-ordinate + const int base_i = out_i * stride_rows; + const int base_j = out_j * stride_cols; + + // Fill the accumulator + for (int in_i = 0; in_i < kernel_rows; in_i++) + { + const int i = base_i + in_i; + for (int in_j = 0; in_j < kernel_cols; in_j++) + { + const int j = base_j + in_j; + if (in_i == 0 && in_j == 0) + { + // v[out_i][out_j] = w[in_i][in_j] * u[i][j]; + v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]); + v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]); + v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]); + v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]); + } + else + { + // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; + v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]); + v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]); + v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]); + v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]); + } + } + } + } + } + + // Store the output tile + for (int i = 0; i < out_cells_i; i++) + { + int32_t* const outptr_row = outptr_base + i*out_row_stride; + for (int j = 0; j < out_cells_j; j++) + { + vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]); + vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]); + vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]); + vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]); + } + } + outptr_base += 16; + } +#endif // __aarch64__ + for (; channels_remaining; channels_remaining--) + { + // Load input tile + int32_t u[inner_tile_rows][inner_tile_cols]; + for (int i = 0; i < inner_tile_rows; i++) + { + const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride; + for (int j = 0; j < inner_tile_cols; j++) + { + if (i < in_pad_top || in_cells_i <= i || + j < in_pad_left || in_cells_j <= j) + { + u[i][j] = static_cast(0); + } + else + { + u[i][j] = static_cast(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset; + } + } + } + inptr_base++; + + // Load weights tile + int32_t w[kernel_rows][kernel_cols]; + for (int i = 0; i < kernel_rows; i++) + { + const uint8_t* const wptr_row = wptr_base + i*weight_row_stride; + for (int j = 0; j < kernel_cols; j++) + { + w[i][j] = static_cast(*(wptr_row + j*weight_col_stride)) + _weights_offset; + } + } + wptr_base++; + + // Perform the convolution + int32_t v[output_tile_rows][output_tile_cols]; + for (int out_i = 0; out_i < out_cells_i; out_i++) + { + for (int out_j = 0; out_j < out_cells_j; out_j++) + { + // Clear the accumulator + v[out_i][out_j] = static_cast(0); + + // Base co-ordinate + const int base_i = out_i * stride_rows; + const int base_j = out_j * stride_cols; + + // Fill the accumulator + for (int in_i = 0; in_i < kernel_rows; in_i++) + { + const int i = base_i + in_i; + for (int in_j = 0; in_j < kernel_cols; in_j++) + { + const int j = base_j + in_j; + v[out_i][out_j] += w[in_i][in_j] * u[i][j]; + } + } + } + } + + // Store the output tile + for (int i = 0; i < out_cells_i; i++) + { + int32_t* const outptr_row = outptr_base + i*out_row_stride; + for (int j = 0; j < out_cells_j; j++) + { + *(outptr_row + j*out_col_stride) = v[i][j]; + } + } + outptr_base++; + } +} + +} // namespace depthwise diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index a46be2ec92..9dcbc99332 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -59,8 +59,25 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we _is_nchw = input->info()->data_layout() == DataLayout::NCHW; _permute = _is_optimized == _is_nchw; + // Initialize the intermediate accumulator tensor in case of quantized input + if(_is_quantized) + { + TensorShape accum_shape = output->info()->tensor_shape(); + DataLayout accum_layout = output->info()->data_layout(); + if(!_is_optimized && !_is_nchw) + { + permute(accum_shape, PermutationVector(1U, 2U, 0U)); + accum_layout = DataLayout::NCHW; + } + + _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info())); + _accumulator.info()->set_data_layout(accum_layout); + zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); + } + if(_is_optimized) { + ITensor *optimized_output = (_is_quantized) ? &_accumulator : output; if(_is_nchw) { // Configure the function to transform the input tensor from NCHW -> NHWC @@ -75,8 +92,8 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); - _permuted_output.info()->set_data_layout(DataLayout::NCHW); + _permuted_output.info()->set_data_layout(DataLayout::NHWC); + _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U)); // Allocate tensors _permuted_input.allocator()->allocate(); @@ -85,26 +102,11 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we } else { - _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC); + _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC); } } else { - // Allocate the intermediate accumulator tensor in case of quantized input - if(_is_quantized) - { - TensorShape accum_shape = output->info()->tensor_shape(); - - if(!_is_nchw) - { - permute(accum_shape, PermutationVector(1U, 2U, 0U)); - } - - _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32)); - _accumulator.info()->set_quantization_info(input->info()->quantization_info()); - zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); - } - if(!_is_nchw) { // Configure the function to transform the input tensor from NHWC -> NCHW @@ -143,7 +145,7 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale; int output_multiplier, output_shift; quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset); + _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset); _accumulator.allocator()->allocate(); } else if(_has_bias) diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h index 889473ecbc..5ef6e112aa 100644 --- a/tests/datasets/DepthwiseConvolutionLayerDataset.h +++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h @@ -204,6 +204,7 @@ public: // Stride 1 add_config(TensorShape(7U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)); add_config(TensorShape(7U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); + add_config(TensorShape(7U, 7U, 21U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)); add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)); // Stride 2 @@ -211,6 +212,7 @@ public: add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::FLOOR)); add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL)); + add_config(TensorShape(8U, 8U, 33U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL)); add_config(TensorShape(64U, 64U, 128U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL)); } }; diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index 54bce0252e..8f87a7d636 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -304,6 +304,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3< { validate(Accessor(_target), _reference, tolerance_qasymm8); } +FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerQuantizedFixture3x3, framework::DatasetMode::ALL, + combine(combine(combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(), + framework::dataset::make("DepthMultiplier", 1)), + framework::dataset::make("DataType", + DataType::QASYMM8)), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), depth_multipliers), diff --git a/utils/Utils.h b/utils/Utils.h index 0bbdcc25d1..130e1f72fe 100644 --- a/utils/Utils.h +++ b/utils/Utils.h @@ -357,7 +357,7 @@ public: void fill_tensor(T &tensor) { ARM_COMPUTE_ERROR_ON(!is_open()); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::QASYMM8, arm_compute::DataType::S32, arm_compute::DataType::F32); try { // Map buffer if creating a CLTensor @@ -413,6 +413,8 @@ public: switch(tensor.info()->data_type()) { + case arm_compute::DataType::QASYMM8: + case arm_compute::DataType::S32: case arm_compute::DataType::F32: { // Read data -- cgit v1.2.1