From 8cffcd6b6e4e95f97767f2a25ccc8826dd69c358 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 16 Nov 2018 17:11:50 +0000 Subject: COMPMID-1644: NEDepthwiseConvolution for FP16 NHWC Change-Id: I6e7dee8bd615a5eff01c523f208a218574ee5eab --- .../core/NEON/kernels/NEDepthwiseIm2ColKernel.h | 6 +-- .../NEON/kernels/NEDepthwiseVectorToTensorKernel.h | 4 +- .../NEON/kernels/NEDepthwiseWeightsReshapeKernel.h | 8 ++-- .../kernels/NEGEMMMatrixVectorMultiplyKernel.h | 4 +- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 6 +-- .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp | 53 ++++++++++++++++++++-- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 2 +- .../validation/NEON/DepthwiseConvolutionLayer.cpp | 29 ++++++++++-- 8 files changed, 92 insertions(+), 20 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h index 0d61d3ea38..de671361d6 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h @@ -55,7 +55,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8, F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, * while every dimension above 3 represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -68,7 +68,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8, F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[in] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, * while every dimension above 3 represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -105,5 +105,5 @@ private: bool _has_bias; unsigned int _depth_multiplier; }; -} // arm_compute +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h index 00977a91b4..25af7a29cc 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h @@ -56,7 +56,7 @@ public: NEDepthwiseVectorToTensorKernel &operator=(NEDepthwiseVectorToTensorKernel &&) = default; /** Set the input and output of the kernel. * - * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F32. + * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F16/F32. * @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. * @param[in] conv_w The converted tensor's width. 
* @param[in] conv_h The converted tensor's height. @@ -64,7 +64,7 @@ public: void configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseVectorToTensorKernel * - * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F32. + * @param[in] input The input vector to convert. Data type supported: QASYMM8/S32/F16/F32. * @param[in] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. * @param[in] conv_w The converted tensor's width. * @param[in] conv_h The converted tensor's height. diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h index b78684f993..dcf52442a9 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h @@ -53,7 +53,8 @@ public: NEDepthwiseWeightsReshapeKernel &operator=(NEDepthwiseWeightsReshapeKernel &&) = default; /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8, F32. + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. + * Data type supported: QASYMM8/F16/F32. * @param[out] output The output tensor. Data type supported: same as @p input. * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. */ @@ -61,7 +62,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseWeightsReshapeKernel * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8, F32. + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. + * Data type supported: QASYMM8/F16/F32. * @param[in] output The output tensor. Data type supported: same as @p input. * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. * @@ -81,5 +83,5 @@ private: ITensor *_output; const ITensor *_biases; }; -} // arm_compute +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h index 7dddaca3a0..c355875c24 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h @@ -50,7 +50,7 @@ public: NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32 + * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32 * @param[in] input1 Second Input tensor. Data types supported: same as @p input. * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input. 
*/ @@ -58,7 +58,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32 + * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32 * @param[in] input1 Second Input tensor. Data types supported: same as @p input. * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input. * diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 288d5136d2..e2fe11ea7f 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -132,7 +132,7 @@ public: NEDepthwiseConvolutionLayer &operator=(NEDepthwiseConvolutionLayer &&) = default; /** Initialize the function's source, destination, weights and convolution information. * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. @@ -146,7 +146,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer * - * @param[in] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. 
@@ -189,5 +189,5 @@ private:
     bool           _is_activationlayer_enabled;
     const ITensor *_original_weights;
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 238786953b..3a1595a0c9 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -43,11 +43,11 @@ namespace
 {
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != DataType::F32));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type()));
 
     ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
@@ -87,6 +87,48 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &wind
 namespace arm_compute
 {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float16_t, float16_t>(const Window &window_in,
+                                                                                    const Window &window_w,
+                                                                                    const Window &window_out)
+{
+    Iterator in(_input0, window_in);
+    Iterator in2(_input1, window_w);
+    Iterator out(_output, window_out);
+
+    const int input_w          = _input0->info()->dimension(0);
+    const int input_h          = _input0->info()->dimension(1);
+    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
+    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+    const int output_stride_x  = _output->info()->strides_in_bytes().x();
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        // Get pointers
+        const uint8_t *const input_ptr   = in.ptr();
+        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+        auto                 output_ptr  = reinterpret_cast<__fp16 *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+        float16x8_t row_dot = vdupq_n_f16(0.f);
+        for(int i = 0; i < input_w; i += 8)
+        {
+            const auto input   = vld1q_f16(reinterpret_cast<const float16_t *>(input_ptr + i * input_stride_x));
+            const auto weights = vld1q_f16(reinterpret_cast<const float16_t *>(weights_ptr + i * weights_stride_x));
+            row_dot            = vaddq_f16(row_dot, vmulq_f16(input, weights));
+        }
+
+        auto temp = vadd_f16(vget_high_f16(row_dot), vget_low_f16(row_dot));
+        temp      = vpadd_f16(temp, temp);
+        temp      = vpadd_f16(temp, temp);
+
+        *output_ptr = vget_lane_f16(temp, 0);
+    },
+    in, in2, out);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 template <>
 void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float>(const Window &window_in,
                                                                             const Window &window_w,
                                                                             const Window &window_out)
@@ -226,6 +268,11 @@ void
NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
         case DataType::QASYMM8:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, int32_t>;
             break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float16_t, float16_t>;
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float>;
             break;
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a2f0094f9d..db7f9af420 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -271,7 +271,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
     ARM_COMPUTE_UNUSED(channel_idx);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
 
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 8f87a7d636..f2b4650527 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -45,9 +45,12 @@ using namespace arm_compute::misc::shape_calculator;
 namespace
 {
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.001)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-constexpr RelativeTolerance<float>   tolerance_f32(0.01f);                  /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                  /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                     tolerance_num = 0.05f;                 /**< Tolerance number */
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
@@ -244,6 +247,26 @@ TEST_SUITE_END() // F32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
+TEST_SUITE(Generic)
+template <typename T>
+using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                                                      depth_multipliers),
+                                                                                                                      framework::dataset::make("DataType",
+                                                                                                                                               DataType::F16)),
+                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW,
DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                    depth_multipliers),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                                             DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
-- cgit v1.2.1
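
Reviewer note, not part of the patch: the core of the new F16 path above is a row-by-vector dot product that keeps both the eight partial products and the running sum in half precision, then reduces the float16x8_t accumulator to a scalar with one cross-half add followed by two pairwise adds. Below is a minimal standalone sketch of that idiom under the same feature guard; the helper name dot_f16 and the multiple-of-8 length assumption are illustrative only, not Compute Library API.

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>

// Dot product of two FP16 rows. Assumes n is a multiple of 8, mirroring the
// layout the kernel's execution window guarantees for its input width.
inline float16_t dot_f16(const float16_t *a, const float16_t *b, int n)
{
    float16x8_t acc = vdupq_n_f16(0.f);
    for(int i = 0; i < n; i += 8)
    {
        // Eight half-precision multiplies, accumulated in half precision.
        acc = vaddq_f16(acc, vmulq_f16(vld1q_f16(a + i), vld1q_f16(b + i)));
    }
    // Horizontal reduction: fold the high half onto the low half, then two
    // pairwise adds collapse the remaining four lanes into lane 0.
    float16x4_t t = vadd_f16(vget_high_f16(acc), vget_low_f16(acc));
    t             = vpadd_f16(t, t);
    t             = vpadd_f16(t, t);
    return vget_lane_f16(t, 0);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

Accumulating in FP16 rather than FP32 trades accuracy for speed, which is consistent with the test update above relaxing the F16 relative tolerance from 0.001 to 0.01 and introducing tolerance_num = 0.05f for the new Generic F16 suites.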