aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
diff options
context:
space:
mode:
authorJoel Liang <joel.liang@arm.com>2017-11-10 09:59:19 +0800
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commitf1f3ebd517089e934cf3f06e64d90619a395ad87 (patch)
tree8dac05909b5f522a1c78e0ac4423cb6f65254391 /src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
parent283c1790da45ab562ecfb2aa7741297191886d85 (diff)
downloadComputeLibrary-f1f3ebd517089e934cf3f06e64d90619a395ad87.tar.gz
APPBROWSER-298, APPBROWSER-306: Reimplement the common code of compute shader
The new common code of compute shader is in file helpers_cs.h Rewrite the direct_convolution1x1.cs and softmax_layer.cs to use the new common code. It will also remove the dependence of the token pasting operator (##). We'll remove the "##" support after we rewrite all of the compute shader code. Change-Id: Icd8553ef6b61ad484a8507590ac8ed499bd47061 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95455 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Frank Lei <frank.lei@arm.com> (cherry picked from commit 0a4f83570d261f839d9866b68979efe8d7a95883) Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95601 Reviewed-by: Jim He <jim.he@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs')
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs220
1 files changed, 72 insertions, 148 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index 3a31cb80a7..071c1858bc 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -24,107 +24,88 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-layout(std140) uniform shader_params
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_attrs The attributes of the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_attrs The attributes of the weights tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes weights_attrs;
#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
+ VectorAttributes biases_attrs;
#endif /* BIAS */
uint weights_stride_w;
uint weights_depth;
};
#if defined(DATA_TYPE_FP32)
-precision highp float;
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- float pixels = CONVERT(0, float);
+ float pixels = 0.f;
uint z_index = gl_GlobalInvocationID.z;
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
float temp;
float temp_weight;
-
for(int d = 0; d < int(weights_depth); ++d)
{
- temp = LOAD4(src, CURRENT_OFFSET(src));
- temp_weight = LOAD4(weights, CURRENT_OFFSET(weights));
+ temp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
pixels += temp * temp_weight;
- src.current_offset += (src_stride_z >> 2);
- weights.current_offset += (weights_stride_z >> 2);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+ pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
- STORE4(dst, CURRENT_OFFSET(dst), pixels);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#if STRIDE_X == 2
@@ -135,15 +116,10 @@ BUFFER_DECLARATION(biases, 4, uint, readonly);
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
-vec4[2] convolve_stride1(Image src, float w)
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
{
- uvec4 packed_s;
- vec4 s[2];
-
- GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
-
- s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
- s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+ vec4 s[2];
+ s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
s[0] *= w;
s[1] *= w;
@@ -151,22 +127,14 @@ vec4[2] convolve_stride1(Image src, float w)
return s;
}
-vec4[2] convolve_stride2(Image src, float w)
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
{
- uvec4 packed_s;
- vec4 s[2];
- vec4 r[2];
-
- GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
- s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
- s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+ vec4 s[2];
+ vec4 r[2];
+ s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
r[0] = vec4(s[0].xz, s[1].xz);
-
- GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0);
- s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
- s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
-
+ s = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
r[1] = vec4(s[0].xz, s[1].xz);
r[0] *= w;
@@ -175,51 +143,14 @@ vec4[2] convolve_stride2(Image src, float w)
return r;
}
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
vec4 pixels[2];
@@ -227,48 +158,41 @@ void main()
pixels[1] = vec4(0.f);
uint z_index = gl_GlobalInvocationID.z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
- weights.current_offset += z_index * weights_stride_w;
-
- uint packed_w;
float w;
-
for(int d = 0; d < int(weights_depth); ++d)
{
- GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
- w = unpackHalf2x16(packed_w).x;
+ w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
- vec4 r[2] = CONVOLVE(src, w);
+ vec4 r[2] = CONVOLVE(src_iter, w);
pixels[0] += r[0];
pixels[1] += r[1];
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
- pixels[0] += vec4(b);
- pixels[1] += vec4(b);
+ pixels[0] += b;
+ pixels[1] += b;
#endif /* BIAS */
- uvec4 packed_d;
- packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw),
- packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
#else /* DATA_TYPE_FP32 */
#error Data type not supported