COMPMID-643: Add bias to CLDepthwiseConvolution.

Change-Id: Ibfe7b8c1172d10cbcae7971fe86b82090519d31d Reviewed-on: http://mpd-gerrit.cambridge.arm.com/92798 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Jaroslaw Rzepecki <jaroslaw.rzepecki@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2017-10-23 20:29:30 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit: 81a26ad6b626ce2da83659d7c6c17b6104d1f203 (patch)
tree: 536807115771f1a4b06048892d1d4e17c98779de /src/core/CL
parent: 511347a7282b948bddfc071e9a8fa08e79da25b4 (diff)
download: ComputeLibrary-81a26ad6b626ce2da83659d7c6c17b6104d1f203.tar.gz
4 files changed, 113 insertions, 25 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 081a4e6c44..411e097dc8 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -169,14 +169,29 @@ inline float2 convolution3x3(
   * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
   * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
   * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
-  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
+  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
+  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
   */
 
-__kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights))
+__kernel void depthwise_convolution_3x3(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif //defined(HAS_BIAS)
 
     uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
     float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
@@ -186,6 +201,9 @@ __kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECL
     float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                    weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                    weights_values2.s0, weights_values2.s1, weights_values2.s2);
+#if defined(HAS_BIAS)
+    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
+#endif //defined(HAS_BIAS)
 
     vstore2(pixels, 0, (__global float *)dst.ptr);
 }
@@ -197,24 +215,38 @@ __kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECL
  *
  * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
  */
-__kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+__kernel void depthwise_weights_reshape(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst)
+#ifdef HAS_BIAS
+    ,
+    VECTOR_DECLARATION(biases)
+#endif /* HAS_BIAS */
+)
 {
     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* HAS_BIAS */
 
     __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
     __global uchar *output_ptr    = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
@@ -223,6 +255,13 @@ __kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARA
     {
         *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
     }
+
+#if defined(HAS_BIAS)
+    if(get_global_id(1) == 0)
+    {
+        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
+    }
+#endif // defined(HAS_BIAS)
 }
 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 
@@ -279,6 +318,9 @@ __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(d
             }
         }
     }
+#if defined(HAS_BIAS)
+    *output_ptr = (DATA_TYPE)(1);
+#endif // defined(HAS_BIAS)
 }
 
 #endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE)
diff --git a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
index 6e56835115..2d0c416d0a 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
@@ -37,7 +37,7 @@
 using namespace arm_compute;
 
 CLDepthwiseConvolution3x3Kernel::CLDepthwiseConvolution3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0)
+    : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0)
 {
 }
 
@@ -46,13 +46,20 @@ BorderSize CLDepthwiseConvolution3x3Kernel::border_size() const
     return _border_size;
 }
 
-void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const ICLTensor *biases, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
 
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
     std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
                                                                               weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
                                                                               conv_info);
@@ -64,6 +71,7 @@ void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTenso
     _input         = input;
     _output        = output;
     _weights       = weights;
+    _biases        = biases;
     _conv_stride_x = conv_info.stride().first;
     _conv_stride_y = conv_info.stride().second;
     _conv_pad_x    = conv_info.pad().first;
@@ -73,6 +81,10 @@ void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTenso
     // Set build options
     ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
     std::set<std::string> options{ "-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x) };
+    if(_biases != nullptr)
+    {
+        options.emplace("-DHAS_BIAS");
+    }
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_3x3", options));
 
@@ -111,6 +123,15 @@ void CLDepthwiseConvolution3x3Kernel::run(const Window &window, cl::CommandQueue
     slice_weights.set_dimension_step(Window::DimX, 0);
     slice_weights.set_dimension_step(Window::DimY, 0);
 
+    // Set biases
+    if(_biases != nullptr)
+    {
+        unsigned int idx = 3 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        add_1D_tensor_argument(idx, _biases, slice_biases);
+    }
+
     do
     {
         unsigned int idx = 0;
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 5c7fe7e0b4..743cd4a38f 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -41,13 +41,13 @@ CLDepthwiseIm2ColKernel::CLDepthwiseIm2ColKernel()
 {
 }
 
-void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info)
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
 
     _input  = input;
     _output = output;
@@ -66,7 +66,10 @@ void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *outpu
     build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
     build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
-
+    if(has_bias)
+    {
+        build_opts.emplace("-DHAS_BIAS");
+    }
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts));
 
     // Configure  kernel window
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
index 68de68b4c5..81dd6b42cc 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -35,19 +35,28 @@
 using namespace arm_compute;
 
 CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
 }
 
-void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * input->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
 
     _input  = input;
+    _biases = biases;
     _output = output;
 
     // Create kernel
@@ -55,6 +64,10 @@ void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTenso
 
     build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    if(_biases != nullptr)
+    {
+        build_opts.emplace("-DHAS_BIAS");
+    }
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_weights_reshape", build_opts));
 
@@ -84,6 +97,15 @@ void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue
     slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
+    // Set biases
+    if(_biases != nullptr)
+    {
+        unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        add_1D_tensor_argument(idx, _biases, slice_biases);
+    }
+
     do
     {
         unsigned int idx = 0;
author	Georgios Pinitas <georgios.pinitas@arm.com>	2017-10-23 20:29:30 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:35:24 +0000
commit	81a26ad6b626ce2da83659d7c6c17b6104d1f203 (patch)
tree	536807115771f1a4b06048892d1d4e17c98779de /src/core/CL
parent	511347a7282b948bddfc071e9a8fa08e79da25b4 (diff)
download	ComputeLibrary-81a26ad6b626ce2da83659d7c6c17b6104d1f203.tar.gz