From 20c246a60869bada4051bd14eb9a3862be5330d7 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 12 Sep 2018 16:45:53 +0100
Subject: COMPMID-1532: Add DepthwiseConvolution3x3 FP16 on NEON

Change-Id: I780970f317b979b3230e2b471ac01df7fda9ee14
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148168
Tested-by: bsgcomp
Reviewed-by: Anthony Barbier
---
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp  | 107 +++++++++++++++------
 1 file changed, 75 insertions(+), 32 deletions(-)

(limited to 'src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp')

diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 88758b523a..7029b06615 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -146,7 +146,7 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_written_pe
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
                           unsigned int depth_multiplier, bool is_optimized)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     const DataLayout data_layout = input->data_layout();
@@ -165,8 +165,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
 
-        //ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (output->data_type() != DataType::S32));
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input->data_type()) && (output->data_type() != DataType::F32));
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        }
     }
 
     return Status{};
@@ -229,6 +235,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         case DataType::QASYMM8:
             num_elems_read_per_iteration = 16;
             break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            num_elems_read_per_iteration = 24;
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
             num_elems_read_per_iteration = 12;
             break;
@@ -313,7 +324,7 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso
     }
 
     // Check supported data type
-    bool supported_datatype = (dt == DataType::F32);
+    bool supported_datatype = is_data_type_float(dt);
 
     // Check for supported strides
     const auto &strides = conv_info.stride();
@@ -334,7 +345,7 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso
 
 void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
 
@@ -371,6 +382,11 @@ void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, con
 
     switch(_input->info()->data_type())
     {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
             convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
             break;
@@ -398,6 +414,7 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3
                                                                                    ITensor       *out,
                                                                                    bool           setup_strides)
 {
+    const DataType    dt      = in->info()->data_type();
     const TensorShape shape   = in->info()->tensor_shape();
     const int         in_rows = shape.z();
     const int         in_cols = shape.y();
@@ -414,34 +431,60 @@
     const int  output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
     const auto stride_x            = conv_info.stride().first;
 
-    switch(stride_x)
+    switch(dt)
     {
-        case 1:
-            return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
-                       n_batches,
-                       in_rows,
-                       in_cols,
-                       n_channels,
-                       padding_same,
-                       reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
-                       reinterpret_cast<const float *>(in->ptr_to_element(Coordinates())),
-                       reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
-                       weight_col_stride, weight_row_stride,
-                       input_col_stride, input_row_stride, input_batch_stride,
-                       output_col_stride, output_row_stride, output_batch_stride);
-        case 2:
-            return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
-                       n_batches,
-                       in_rows,
-                       in_cols,
-                       n_channels,
-                       padding_same,
-                       reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
-                       reinterpret_cast<const float *>(in->ptr_to_element(Coordinates())),
-                       reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
-                       weight_col_stride, weight_row_stride,
-                       input_col_stride, input_row_stride, input_batch_stride,
-                       output_col_stride, output_row_stride, output_batch_stride);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<const float16_t *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                case 2:
+                    return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<const float16_t *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                default:
+                    return nullptr;
+            }
+            break;
+        }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<const float *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                case 2:
+                    return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<const float *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                default:
+                    return nullptr;
+            }
+            break;
+        }
         default:
             return nullptr;
     }
--
cgit v1.2.1
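
Illustration (not part of the patch above): every FP16 path added by this change sits behind the
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC guard, so the half-precision kernels are only compiled when the
toolchain targets an FP16-capable architecture (for example -march=armv8.2-a+fp16). A minimal,
self-contained sketch of the same guard pattern using NEON half-precision intrinsics follows; the
function name and the scalar tail loop are assumptions made for this example and are not taken from
the Compute Library sources.

    #include <arm_neon.h>
    #include <cstddef>

    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // Hypothetical helper: dst[i] += src[i] * scale, eight FP16 lanes per iteration.
    void scale_accumulate_f16(const float16_t *src, float16_t *dst, float16_t scale, size_t n)
    {
        const float16x8_t vscale = vdupq_n_f16(scale);
        size_t            i      = 0;
        for(; i + 8 <= n; i += 8)
        {
            const float16x8_t a = vld1q_f16(src + i); // load 8 half-precision values from src
            const float16x8_t b = vld1q_f16(dst + i); // load 8 half-precision values from dst
            vst1q_f16(dst + i, vaddq_f16(b, vmulq_f16(a, vscale)));
        }
        for(; i < n; ++i)
        {
            dst[i] += src[i] * scale; // scalar tail for the remaining elements
        }
    }
    #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

When the macro is not defined, none of this code is built, which mirrors how the patch keeps the
DataType::F16 cases out of the kernel on targets without FP16 vector arithmetic.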