aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
diff options
context:
space:
mode:
authorFrank Lei <frank.lei@arm.com>2017-12-05 10:43:33 +0800
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:42:17 +0000
commitb9d38ee6378f3035f8dbad442223d3d9e2f3dc4f (patch)
tree89a4b81430100a4a91902d5987ae42edc438012c /src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
parent397d58aa40b02a26923c34d8cd4ba274eac45963 (diff)
downloadComputeLibrary-b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f.tar.gz
APPBROWSER-312 Fully connected performance optimization
Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830 Reviewed-by: Stephen Li <stephen.li@arm.com> Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs')
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs67
1 files changed, 58 insertions, 9 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 1a0c9f1d30..87a109adc0 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -25,14 +25,6 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
#include "helpers.h"
-#ifdef DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, restrict);
-#else // DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, restrict);
-#endif // DATA_TYPE_FP16
-
layout(std140) uniform shader_params
{
#ifdef IM2COL_GENERIC
@@ -58,10 +50,21 @@ layout(std140) uniform shader_params
};
#ifdef DATA_TYPE_FP16
+#if defined(IM2COL_REDUCED_8X)
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, restrict);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, restrict);
+#else /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, restrict);
+#endif /* IM2COL_REDUCED_8X */
precision mediump float;
#ifdef IM2COL_REDUCED
+#if defined(IM2COL_REDUCED_GENERIC)
/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
@@ -142,9 +145,55 @@ void main(void)
}
#endif // HAS_BIAS
}
-#endif // IM2COL_REDUCED
+#else /* IM2COL_REDUCED_GENERIC */
+/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases will be added at a later stage, "#define HAS_BIAS" has to be passed to append a 1 to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ */
+void main(void) // Each invocation copies one packed word from the 3D source into the flattened destination row.
+{
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); // global (x, y, z) index of this invocation
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); // NOTE(review): presumably sets src.current_offset to this invocation's source byte offset - confirm in helpers.h
+ Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst); // NOTE(review): "NO_STEP" presumably leaves dst.current_offset un-advanced per invocation - confirm in helpers.h
+#if defined(IM2COL_REDUCED_8X)
+ uint tmp_out_offset = dst.current_offset + ((pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); // linear element index (8 per x step, width per y step, IMAGE_SIZE per z step) scaled by dst.stride_x
+ uvec4 tmp; // one uvec4 is 16 bytes, i.e. 8 FP16 values, matching the factor of 8 above
+ LOAD1(tmp, src, src.current_offset >> uint(4)); // >> 4 divides by 16; presumably byte offset -> uvec4 index, since src is declared as uvec4 in this variant - confirm LOAD1
+ STORE1(dst, tmp_out_offset >> uint(4), tmp); // same /16 conversion applied to the destination offset
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+ uint tmp_out_offset = dst.current_offset + ((pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); // same formula with 4 elements per x step
+ uvec2 tmp; // one uvec2 is 8 bytes, i.e. 4 FP16 values
+ LOAD1(tmp, src, src.current_offset >> uint(3)); // >> 3 divides by 8; presumably byte offset -> uvec2 index - confirm LOAD1
+ STORE1(dst, tmp_out_offset >> uint(3), tmp); // same /8 conversion applied to the destination offset
+#else /* IM2COL_REDUCED_8X */
+ uint tmp_out_offset = dst.current_offset + ((pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x); // same formula with 2 elements per x step
+ uint tmp; // one uint is 4 bytes, i.e. 2 FP16 values
+ LOAD1(tmp, src, src.current_offset >> uint(2)); // >> 2 divides by 4; presumably byte offset -> uint index - confirm LOAD1
+ STORE1(dst, tmp_out_offset >> uint(2), tmp); // same /4 conversion applied to the destination offset
+#endif /* IM2COL_REDUCED_8X */
+}
+#endif /* IM2COL_REDUCED_GENERIC */
+#endif // IM2COL_REDUCED
#elif defined(DATA_TYPE_FP32)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
#ifdef IM2COL_GENERIC
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.