From d24af8a3e8f7cbd38fd3142056241c0c9f63e46a Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Tue, 8 May 2018 17:23:52 +0100
Subject: COMPMID-1125: Add support for FP16 in CLDepthwiseConvolution

Change-Id: I4838f5a8e4c33ed646cd05e0bb682fca635a29a3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/130469
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
---
 .../core/CL/kernels/CLDepthwiseIm2ColKernel.h      |  2 +-
 .../CL/kernels/CLDepthwiseWeightsReshapeKernel.h   |  2 +-
 arm_compute/core/CL/kernels/CLFillBorderKernel.h   |  4 +-
 src/core/CL/cl_kernels/depthwise_convolution.cl    |  2 +-
 src/core/CL/cl_kernels/fill_border.cl              |  6 +--
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  2 +-
 tests/validation/CL/DepthwiseConvolutionLayer.cpp  | 57 ++++++++++++++--------
 7 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h
index b8343da50a..3f3e36100a 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h
@@ -51,7 +51,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  input            The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                              while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F32
+     *                              while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
      * @param[out] output           The output tensor. First 3 lower dimensions represent a transform of each 3D input,
      *                              while every dimension above 3 represents a batch. Data types supported: Same as @p input
      * @param[in]  kernel_dims      The kernel dimensions (width and height).
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h
index c8d1e2f1ad..1c1eaca474 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h
@@ -51,7 +51,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  input  The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM].
-     *                    Data type supported: QASYMM8/F32.
+     *                    Data type supported: QASYMM8/F16/F32.
      * @param[out] output The output tensor. Data type supported: same as @p input.
      * @param[in]  biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input.
      */
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
index dc57978ae1..18031c7e7e 100644
--- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h
+++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,7 @@ public:
 
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in,out] tensor                Tensor to process Data types supported: U8/QS8/S16/QS16/S32/F32.
+     * @param[in,out] tensor                Tensor to process Data types supported: U8/QS8/S16/QS16/S32/F16/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 21c28539ef..5f4247e5d3 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -500,7 +500,7 @@ __kernel void depthwise_weights_reshape(
 #if defined(HAS_BIAS)
     if(get_global_id(1) == 0)
     {
-        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
+        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
     }
 #endif // defined(HAS_BIAS)
 }
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index fbd4f6ae0f..33a9495d66 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -110,7 +110,7 @@ __kernel void fill_image_borders_replicate(
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index ea2f93b85d..88bb0c417d 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -80,7 +80,7 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
 
 void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
 
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index ad7a5d819b..54b7925a09 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -45,6 +45,7 @@ namespace
 RelativeTolerance<half_float::half>  tolerance_f16(half_float::half(0.001)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr RelativeTolerance<float>   tolerance_f32(0.01f);                   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                   /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr float                      tolerance_num = 0.05f;                  /**< Tolerance number */
 
 const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
@@ -55,29 +56,11 @@ TEST_SUITE(DepthwiseConvolutionLayer)
 template <typename T>
 using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
 
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::F32)),
-                                                                                                                 framework::dataset::make("DataLayout", DataLayout::NCHW)))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                                                                     depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END()
-
 template <typename T>
 using CLDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer3x3, T>;
 
 TEST_SUITE(Float)
-TEST_SUITE(F16)
+TEST_SUITE(FP16)
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL,
                        combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
@@ -98,6 +81,24 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, fr
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 TEST_SUITE_END()
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F16)),
+                                                                                                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                    depth_multipliers),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                            DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
@@ -121,6 +122,24 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, f
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE_END()
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+                                                                                                                 framework::dataset::make("DataType",
+                                                                                                                         DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                     depth_multipliers),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END()
 TEST_SUITE_END()
 TEST_SUITE_END()
 
-- 
cgit v1.2.1