From 8cffcd6b6e4e95f97767f2a25ccc8826dd69c358 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 16 Nov 2018 17:11:50 +0000 Subject: COMPMID-1644: NEDepthwiseConvolution for FP16 NHWC Change-Id: I6e7dee8bd615a5eff01c523f208a218574ee5eab --- .../core/NEON/kernels/NEDepthwiseIm2ColKernel.h | 6 +-- .../NEON/kernels/NEDepthwiseVectorToTensorKernel.h | 4 +- .../NEON/kernels/NEDepthwiseWeightsReshapeKernel.h | 8 ++-- .../kernels/NEGEMMMatrixVectorMultiplyKernel.h | 4 +- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 6 +-- .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp | 53 ++++++++++++++++++++-- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 2 +- .../validation/NEON/DepthwiseConvolutionLayer.cpp | 29 ++++++++++-- 8 files changed, 92 insertions(+), 20 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h index 0d61d3ea38..de671361d6 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h @@ -55,7 +55,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8, F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, * while every dimension above 3 represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -68,7 +68,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8, F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[in] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, * while every dimension above 3 represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -105,5 +105,5 @@ private: bool _has_bias; unsigned int _depth_multiplier; }; -} // arm_compute +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h index 00977a91b4..25af7a29cc 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h @@ -56,7 +56,7 @@ public: NEDepthwiseVectorToTensorKernel &operator=(NEDepthwiseVectorToTensorKernel &&) = default; /** Set the input and output of the kernel. * - * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F32. + * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F16/F32. * @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. * @param[in] conv_w The converted tensor's width. 
* @param[in] conv_h The converted tensor's height. @@ -64,7 +64,7 @@ public: void configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseVectorToTensorKernel * - * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F32. + * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F16/F32. * @param[in] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. * @param[in] conv_w The converted tensor's width. * @param[in] conv_h The converted tensor's height. diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h index b78684f993..dcf52442a9 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h @@ -53,7 +53,8 @@ public: NEDepthwiseWeightsReshapeKernel &operator=(NEDepthwiseWeightsReshapeKernel &&) = default; /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8, F32. + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. + * Data type supported: QASYMM8/F16/F32. * @param[out] output The output tensor. Data type supported: same as @p input. * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. */ @@ -61,7 +62,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseWeightsReshapeKernel * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8, F32. + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. + * Data type supported: QASYMM8/F16/F32. * @param[in] output The output tensor. Data type supported: same as @p input. * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. * @@ -81,5 +83,5 @@ private: ITensor *_output; const ITensor *_biases; }; -} // arm_compute +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h index 7dddaca3a0..c355875c24 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h @@ -50,7 +50,7 @@ public: NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32 + * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32 * @param[in] input1 Second Input tensor. Data types supported: same as @p input. * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input. 
*/ @@ -58,7 +58,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32 + * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32 * @param[in] input1 Second Input tensor. Data types supported: same as @p input. * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input. * diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 288d5136d2..e2fe11ea7f 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -132,7 +132,7 @@ public: NEDepthwiseConvolutionLayer &operator=(NEDepthwiseConvolutionLayer &&) = default; /** Initialize the function's source, destination, weights and convolution information. * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. @@ -146,7 +146,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer * - * @param[in] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. 
@@ -189,5 +189,5 @@ private:
     bool           _is_activationlayer_enabled;
     const ITensor *_original_weights;
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 238786953b..3a1595a0c9 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -43,11 +43,11 @@ namespace
 {
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != DataType::F32));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type()));
 
     ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
@@ -87,6 +87,48 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &wind
 namespace arm_compute
 {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float16_t, float16_t>(const Window &window_in,
+                                                                                    const Window &window_w,
+                                                                                    const Window &window_out)
+{
+    Iterator in(_input0, window_in);
+    Iterator in2(_input1, window_w);
+    Iterator out(_output, window_out);
+
+    const int input_w          = _input0->info()->dimension(0);
+    const int input_h          = _input0->info()->dimension(1);
+    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
+    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+    const int output_stride_x  = _output->info()->strides_in_bytes().x();
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        // Get pointers
+        const uint8_t *const input_ptr   = in.ptr();
+        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+        auto                 output_ptr  = reinterpret_cast<__fp16 *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+        float16x8_t row_dot = vdupq_n_f16(0.f);
+        for(int i = 0; i < input_w; i += 8)
+        {
+            const auto input   = vld1q_f16(reinterpret_cast<const float16_t *>(input_ptr + i * input_stride_x));
+            const auto weights = vld1q_f16(reinterpret_cast<const float16_t *>(weights_ptr + i * weights_stride_x));
+            row_dot            = vaddq_f16(row_dot, vmulq_f16(input, weights));
+        }
+
+        auto temp = vadd_f16(vget_high_f16(row_dot), vget_low_f16(row_dot));
+        temp      = vpadd_f16(temp, temp);
+        temp      = vpadd_f16(temp, temp);
+
+        *output_ptr = vget_lane_f16(temp, 0);
+    },
+    in, in2, out);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 template <>
 void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float>(const Window &window_in,
                                                                             const Window &window_w,
                                                                             const Window &window_out)
@@ -226,6 +268,11 @@ void
NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
         case DataType::QASYMM8:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, int32_t>;
             break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float16_t, float16_t>;
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float>;
             break;
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a2f0094f9d..db7f9af420 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -271,7 +271,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
     ARM_COMPUTE_UNUSED(channel_idx);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
 
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 8f87a7d636..f2b4650527 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -45,9 +45,12 @@ using namespace arm_compute::misc::shape_calculator;
 namespace
 {
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.001)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-constexpr RelativeTolerance<float>   tolerance_f32(0.01f);                  /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                  /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                     tolerance_num = 0.05f;                 /**< Tolerance number */
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
@@ -244,6 +247,26 @@ TEST_SUITE_END() // F32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
+TEST_SUITE(Generic)
+template <typename T>
+using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                                                      depth_multipliers),
+                                                                                                                      framework::dataset::make("DataType",
+                                                                                                                                               DataType::F16)),
+                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW,
DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                    depth_multipliers),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                                             DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
-- cgit v1.2.1
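
Reviewer note, not part of the patch: the core of the new F16 path above is a row-by-vector dot product that keeps both the eight partial products and the running sum in half precision, then reduces the float16x8_t accumulator to a scalar with one cross-half add followed by two pairwise adds. Below is a minimal standalone sketch of that idiom under the same feature guard; the helper name dot_f16 and the multiple-of-8 length assumption are illustrative only, not Compute Library API.

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>

// Dot product of two FP16 rows. Assumes n is a multiple of 8, mirroring the
// layout the kernel's execution window guarantees for its input width.
inline float16_t dot_f16(const float16_t *a, const float16_t *b, int n)
{
    float16x8_t acc = vdupq_n_f16(0.f);
    for(int i = 0; i < n; i += 8)
    {
        // Eight half-precision multiplies, accumulated in half precision.
        acc = vaddq_f16(acc, vmulq_f16(vld1q_f16(a + i), vld1q_f16(b + i)));
    }
    // Horizontal reduction: fold the high half onto the low half, then two
    // pairwise adds collapse the remaining four lanes into lane 0.
    float16x4_t t = vadd_f16(vget_high_f16(acc), vget_low_f16(acc));
    t             = vpadd_f16(t, t);
    t             = vpadd_f16(t, t);
    return vget_lane_f16(t, 0);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

Accumulating in FP16 rather than FP32 trades accuracy for speed, which is consistent with the test update above relaxing the F16 relative tolerance from 0.001 to 0.01 and introducing tolerance_num = 0.05f for the new Generic F16 suites.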