APPBROWSER-312 Fully connected performance optimization

Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830 Reviewed-by: Stephen Li <stephen.li@arm.com> Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Frank Lei <frank.lei@arm.com> 2017-12-05 10:43:33 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:17 +0000
commit: b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f (patch)
tree: 89a4b81430100a4a91902d5987ae42edc438012c /src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
parent: 397d58aa40b02a26923c34d8cd4ba274eac45963 (diff)
download: ComputeLibrary-b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f.tar.gz
1 files changed, 58 insertions, 9 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 1a0c9f1d30..87a109adc0 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -25,14 +25,6 @@
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 #include "helpers.h"
 
-#ifdef DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, restrict);
-#else  // DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, restrict);
-#endif // DATA_TYPE_FP16
-
 layout(std140) uniform shader_params
 {
 #ifdef IM2COL_GENERIC
@@ -58,10 +50,21 @@ layout(std140) uniform shader_params
 };
 
 #ifdef DATA_TYPE_FP16
+#if defined(IM2COL_REDUCED_8X)
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, restrict);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, restrict);
+#else                            /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, restrict);
+#endif                           /* IM2COL_REDUCED_8X */
 
 precision mediump float;
 
 #ifdef IM2COL_REDUCED
+#if defined(IM2COL_REDUCED_GENERIC)
 /** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
  *
  * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
@@ -142,9 +145,55 @@ void main(void)
     }
 #endif // HAS_BIAS
 }
-#endif // IM2COL_REDUCED
+#else /* IM2COL_REDUCED_GENERIC */
+/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz); // global invocation coordinates of this work item
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+#if defined(IM2COL_REDUCED_8X)
+    uint     tmp_out_offset = dst.current_offset + ((pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); // linear index (x * 8 + y * width + z * IMAGE_SIZE) scaled by dst.stride_x
+    uvec4    tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(4)); // >> 4 divides by 16, the size in bytes of one uvec4 element; assumes current_offset is a byte offset - TODO confirm in helpers.h
+    STORE1(dst, tmp_out_offset >> uint(4), tmp); // same division by 16 for the uvec4-typed dst buffer
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+    uint  tmp_out_offset = dst.current_offset + ((pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); // linear index (x * 4 + y * width + z * IMAGE_SIZE) scaled by dst.stride_x
+    uvec2 tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(3)); // >> 3 divides by 8, the size in bytes of one uvec2 element; assumes current_offset is a byte offset - TODO confirm in helpers.h
+    STORE1(dst, tmp_out_offset >> uint(3), tmp); // same division by 8 for the uvec2-typed dst buffer
+#else                            /* IM2COL_REDUCED_8X */
+    uint tmp_out_offset = dst.current_offset + ((pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); // linear index (x * 2 + y * width + z * IMAGE_SIZE) scaled by dst.stride_x
+    uint tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(2)); // >> 2 divides by 4, the size in bytes of one uint element; assumes current_offset is a byte offset - TODO confirm in helpers.h
+    STORE1(dst, tmp_out_offset >> uint(2), tmp); // same division by 4 for the uint-typed dst buffer
+#endif                           /* IM2COL_REDUCED_8X */
+}
+#endif                           /* IM2COL_REDUCED_GENERIC */
+#endif                           // IM2COL_REDUCED
 
 #elif defined(DATA_TYPE_FP32)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
 
 #ifdef IM2COL_GENERIC
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
author	Frank Lei <frank.lei@arm.com>	2017-12-05 10:43:33 +0800
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:42:17 +0000
commit	b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f (patch)
tree	89a4b81430100a4a91902d5987ae42edc438012c /src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
parent	397d58aa40b02a26923c34d8cd4ba274eac45963 (diff)
download	ComputeLibrary-b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f.tar.gz