-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 23
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp         |  4
-rw-r--r--  tests/validation/NEON/FullyConnectedLayer.cpp                 | 24
-rw-r--r--  tests/validation/Reference.cpp                                | 36
4 files changed, 71 insertions(+), 16 deletions(-)
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 826a386557..f3d06ed481 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -45,7 +45,7 @@ NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
@@ -109,6 +109,27 @@ void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
in0_out, in1);
break;
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
+ const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
+ const float16x8x2_t res =
+ {
+ {
+ vaddq_f16(accum.val[0], biases.val[0]),
+ vaddq_f16(accum.val[1], biases.val[1])
+ }
+ };
+
+ vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
{
execute_window_loop(window, [&](const Coordinates & id)
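
The F16 case added above follows the same structure as the surrounding data-type paths: each window iteration de-interleaves two float16x8 vectors (16 half-precision lanes) from the accumulator row, adds the corresponding bias lanes, and stores the sums back in place. A minimal standalone sketch of the same intrinsic sequence, with hypothetical flat buffers and assuming a toolchain with half-precision vector arithmetic enabled (e.g. -march=armv8.2-a+fp16):

    #include <arm_neon.h>
    #include <cstddef>

    // Accumulate 'bias' into 'accum' in place, 16 fp16 lanes per iteration.
    // 'len' is assumed to be a multiple of 16 for brevity; the kernel itself
    // relies on the window/padding machinery instead of this restriction.
    void accumulate_biases_f16(float16_t *accum, const float16_t *bias, size_t len)
    {
        for(size_t i = 0; i < len; i += 16)
        {
            const float16x8x2_t a = vld2q_f16(accum + i);
            const float16x8x2_t b = vld2q_f16(bias + i);
            const float16x8x2_t r =
            {
                {
                    vaddq_f16(a.val[0], b.val[0]),
                    vaddq_f16(a.val[1], b.val[1])
                }
            };
            vst2q_f16(accum + i, r);
        }
    }
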
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index eb84ccaddc..4d9ee85f9b 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -39,7 +39,7 @@ NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(output == nullptr);
ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
@@ -196,7 +196,7 @@ void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITens
void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
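
At the function level these two validation-macro changes are all that NEFullyConnectedLayer needs; the per-element work lives in the kernels, including the bias-accumulation kernel patched above. With F16 accepted, an FP16 fully connected layer is configured like any other data type. A hypothetical usage sketch (the shapes and the 128-to-64 layer size are invented for illustration):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_fp16_fc()
    {
        Tensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F16));
        weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F16));
        biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F16));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F16));

        NEFullyConnectedLayer fc;
        fc.configure(&src, &weights, &biases, &dst,
                     /* transpose_weights = */ true, /* are_weights_reshaped = */ false);

        // Allocate backing memory only after configure(), as usual for NEON functions.
        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src/weights/biases with fp16 data, then:
        fc.run();
    }
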
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index 87e0071007..fa962787d1 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -45,6 +45,9 @@ namespace
{
const float tolerance_f32 = 1e-03f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
const float tolerance_q = 1.0f; /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const float tolerance_f16 = 0.01f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
Tensor compute_fully_connected_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt,
bool transpose_weights, int fixed_point_position)
@@ -82,7 +85,7 @@ Tensor compute_fully_connected_layer(const TensorShape &input_shape, const Tenso
BOOST_TEST(!dst.info()->is_resizable());
// Fill tensors
- if(dt == DataType::F32)
+ if(dt == DataType::F16 || dt == DataType::F32)
{
std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
library->fill(NEAccessor(src), distribution, 0);
@@ -153,6 +156,25 @@ BOOST_DATA_TEST_CASE(Configuration,
validate(dst.info()->valid_region(), dst_valid_region);
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+BOOST_AUTO_TEST_SUITE(Float16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+ SmallFullyConnectedLayerDataset() * boost::unit_test::data::make({ DataType::F16 }),
+ fc_set, dt)
+{
+ // Compute function
+ Tensor dst = compute_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f16);
+}
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
BOOST_AUTO_TEST_SUITE(Float)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
BOOST_DATA_TEST_CASE(RunSmall,
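
The looser F16 tolerance introduced above (0.01 versus 1e-03 for F32) is expected: half precision has a 10-bit mantissa, so its machine epsilon is 2^-10 ≈ 9.8e-4, and rounding error accumulates across each dot product in the layer; 0.01 leaves roughly ten ulps of headroom at the unit scale where the [-1, 1] fill keeps the values. A minimal sketch of an absolute-tolerance element-wise check in the spirit of the validate() call (a simplified, hypothetical helper, not the test framework's actual implementation):

    #include <cmath>
    #include <cstddef>

    // Returns true if every element of 'out' lies within 'tol' of 'ref'.
    bool within_tolerance(const float *out, const float *ref, size_t n, float tol)
    {
        for(size_t i = 0; i < n; ++i)
        {
            if(std::fabs(out[i] - ref[i]) > tol)
            {
                return false;
            }
        }
        return true;
    }
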
diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp
index 62dfcba37e..04362f0dc1 100644
--- a/tests/validation/Reference.cpp
+++ b/tests/validation/Reference.cpp
@@ -506,18 +506,30 @@ RawTensor Reference::compute_reference_convolution_layer(const TensorShape &inpu
RawTensor ref_dst = library->get(output_shape, dt, 1, fixed_point_position);
// Fill reference
- if(dt == DataType::F16 || dt == DataType::F32)
- {
- std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
- library->fill(ref_src, distribution, 0);
- library->fill(ref_weights, distribution, 1);
- library->fill(ref_bias, distribution, 2);
- }
- else
+ switch(dt)
{
- library->fill_tensor_uniform(ref_src, 0);
- library->fill_tensor_uniform(ref_weights, 1);
- library->fill_tensor_uniform(ref_bias, 2);
+ case DataType::F32:
+ case DataType::F16:
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(ref_src, distribution, 0);
+ library->fill(ref_weights, distribution, 1);
+ library->fill(ref_bias, distribution, 2);
+ break;
+ }
+ case DataType::QS16:
+ case DataType::QS8:
+ {
+ library->fill_tensor_uniform(ref_src, 0);
+ library->fill_tensor_uniform(ref_weights, 1);
+ library->fill_tensor_uniform(ref_bias, 2);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
// Compute reference
@@ -546,7 +558,7 @@ RawTensor Reference::compute_reference_fully_connected_layer(const TensorShape &
RawTensor ref_weights = library->get(ws, dt, 1, fixed_point_position);
// Fill reference
- if(dt == DataType::F32)
+ if(dt == DataType::F16 || dt == DataType::F32)
{
std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
library->fill(ref_src, distribution, 0);
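
As above, the reference fills floating-point tensors from a uniform [-1, 1) distribution with a fixed per-tensor seed offset (0 for the source, 1 for the weights, 2 for the bias), so reference and implementation see identical input data. A standalone sketch of the same fill pattern (a hypothetical helper, assuming the library's fill() behaves like a seeded loop of this kind):

    #include <cstddef>
    #include <random>
    #include <vector>

    // Deterministically fill a buffer with uniform values in [-1, 1),
    // seeded per tensor (0, 1, 2, ...) as in the tests above.
    std::vector<float> make_uniform_fill(size_t n, unsigned seed)
    {
        std::mt19937 gen(seed);
        std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
        std::vector<float> values(n);
        for(auto &v : values)
        {
            v = static_cast<float>(distribution(gen));
        }
        return values;
    }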