aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
diff options
context:
space:
mode:
author Joel Liang <joel.liang@arm.com> 2017-11-10 09:59:19 +0800
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit f1f3ebd517089e934cf3f06e64d90619a395ad87 (patch)
tree 8dac05909b5f522a1c78e0ac4423cb6f65254391 /src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
parent 283c1790da45ab562ecfb2aa7741297191886d85 (diff)
download ComputeLibrary-f1f3ebd517089e934cf3f06e64d90619a395ad87.tar.gz
APPBROWSER-298, APPBROWSER-306: Reimplement the common code of compute shader
The new common code for the compute shaders is in the file helpers_cs.h. Rewrite direct_convolution1x1.cs and softmax_layer.cs to use the new common code. This also removes the dependence on the token pasting operator (##). We'll remove the "##" support after we rewrite all of the compute shader code. Change-Id: Icd8553ef6b61ad484a8507590ac8ed499bd47061 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95455 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Frank Lei <frank.lei@arm.com> (cherry picked from commit 0a4f83570d261f839d9866b68979efe8d7a95883) Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95601 Reviewed-by: Jim He <jim.he@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs')
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs 596
1 files changed, 198 insertions, 398 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
index 0bbabeaafc..1a2c3f7b20 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -24,99 +24,60 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+// Common definitions
#define MAX_OP(x, y) max((x), (y))
#define ADD_OP(x, y) ((x) + (y))
#define SUB_OP(x, y) ((x) - (y))
#define DIV_OP(x, y) ((x) / (y))
#define EXP_OP(x) exp((x))
-#if defined(DATA_TYPE_FP32)
-const float MINVAL = -1.0 / 0.0;
-vec4 type_min = CONVERT(MINVAL, vec4);
-
-#define LOAD16(name, offset) \
- vec4(LOAD4(name, offset), \
- LOAD4(name, offset + uint(1)), \
- LOAD4(name, offset + uint(2)), \
- LOAD4(name, offset + uint(3)))
-
-#define STORE16(name, offset, value) \
- STORE4(name, offset, value.x); \
- STORE4(name, offset + uint(1), value.y); \
- STORE4(name, offset + uint(2), value.z); \
- STORE4(name, offset + uint(3), value.w)
+const float float_min = -1.0 / 0.0;
+const vec4 vec4_min = vec4(float_min);
#ifdef SOFTMAX_LAYER_MAX
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(max, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-BUFFER_DECLARATION(sum, 4, float, writeonly);
-#elif defined(SOFTMAX_LAYER_NORM)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(sum, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-#endif // SOFTMAX_LAYER_MAX
-layout(std140) uniform shader_params
-{
-#ifdef SOFTMAX_LAYER_MAX
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- uint width;
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(max);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(sum);
- uint width;
-#elif defined(SOFTMAX_LAYER_NORM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(sum);
- TENSOR3D_PARAM_DECLARATION(dst);
-#endif // SOFTMAX_LAYER_MAX
-};
-
-#ifdef SOFTMAX_LAYER_MAX
/** Identifies the maximum value across the 1st dimension.
*
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width Input image width
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] width Input image width
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ uint width;
+};
+
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
// Initialize local maximum
- vec4 max_val = CONVERT(type_min, vec4);
+ vec4 max_val = vec4_min;
// Calculate max of row
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data = LOAD16(src, offset(src, i << 2, 0));
+ vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
max_val = MAX_OP(data, max_val);
}
@@ -124,7 +85,7 @@ void main(void)
// Handle non multiple of 4
for(int i = int(width2 << 2); i < int(width); i++)
{
- float data = LOAD4(src, offset(src, i, 0));
+ float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
max_val.x = MAX_OP(data, max_val.x);
}
#endif /* NON_MULTIPLE_OF_4 */
@@ -134,408 +95,247 @@ void main(void)
max_val.x = MAX_OP(max_val.x, max_val.y);
// Store result
- STORE4(dst, CURRENT_OFFSET(dst), max_val.x);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, max_val.x);
}
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
- *
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
- */
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
- // Load max value of 1D logits vector (row)
- vec4 max_val = CONVERT(LOAD4(max, CURRENT_OFFSET(max)), vec4);
-
- // Set sum vector
- vec4 sum1D = CONVERT(0, vec4);
+ // Initialize local maximum
+ vec4 max_val = vec4_min;
- // Shift values, exp and sum
+ // Calculate max of row
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data = LOAD16(src, offset(src, i << 2, 0));
- data = SUB_OP(data, max_val);
- data = EXP_OP(data);
- STORE16(dst, offset(dst, i << 2, 0), data);
- sum1D = ADD_OP(sum1D, data);
+ vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+ max_val = MAX_OP(data, max_val);
}
#ifdef NON_MULTIPLE_OF_4
// Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i++)
+ for(int i = int(width2 << 2); i < int(width); i = i + 2)
{
- float data;
- data = LOAD4(src, offset(src, i, 0));
- data = SUB_OP(data, max_val.x);
- data = EXP_OP(data);
- STORE4(dst, offset(dst, i, 0), data);
- sum1D.x = ADD_OP(sum1D.x, data);
+ vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ max_val.x = MAX_OP(data.x, max_val.x);
+ if((i + 1) < int(width))
+ {
+ max_val.x = MAX_OP(data.y, max_val.x);
+ }
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
- // Perform min/max reduction
- sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
- sum1D.x = ADD_OP(sum1D.x, sum1D.y);
+ // Perform max reduction
+ max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+ max_val.x = MAX_OP(max_val.x, max_val.y);
- // Calculate and store result
- STORE4(sum, CURRENT_OFFSET(sum), sum1D.x);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, max_val.xy);
}
-#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
+ * then gets the exponent of each element and sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] max_attrs The attributes of the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_attrs The attributes of the sum values tensor
+ * @param[in] width Input image width
*/
-void main(void)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load max value of 1D logits vector (row)
- vec4 sum_val = CONVERT(LOAD4(sum, offset(sum, 0, int(gl_GlobalInvocationID.y))), vec4);
- vec4 data = LOAD16(src, CURRENT_OFFSET(src));
- STORE16(dst, CURRENT_OFFSET(dst), DIV_OP(data, sum_val));
-}
-#endif // SOFTMAX_LAYER_MAX
-
-#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-
-const float MINVAL1 = -1.0 / 0.0;
-vec4 type_min1 = CONVERT(MINVAL1, vec4);
-
-#define GC_LOAD4_IMAGE(r, name, x, y) \
- load_and_unpack(r.xy, name, x, y); \
- load_and_unpack(r.zw, name, (x + 2), y)
-
-#define GC_STORE4_IMAGE(r, name, x, y) \
- GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.xy)), name, x, y); \
- GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.zw)), name, (x + 2), y)
-
-#ifdef SOFTMAX_LAYER_MAX
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(max, 2, uint, readonly);
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-BUFFER_DECLARATION(sum, 4, uint, writeonly);
-#elif defined(SOFTMAX_LAYER_NORM)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(sum, 2, uint, readonly);
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-#endif // SOFTMAX_LAYER_MAX
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
{
-#ifdef SOFTMAX_LAYER_MAX
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- uint width;
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(max);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(sum);
- uint width;
-#elif defined(SOFTMAX_LAYER_NORM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(sum);
- TENSOR3D_PARAM_DECLARATION(dst);
-#endif // SOFTMAX_LAYER_MAX
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes max_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes sum_attrs;
+ uint width;
};
+#if defined(DATA_TYPE_FP32)
-#define load_and_unpack(rs, names, xs, ys) \
- do \
- { \
- uint packed_s; \
- GC_LOAD1_2D_OFFSET(packed_s, names, xs, ys); \
- rs = vec2(unpackHalf2x16(packed_s)); \
- } while(false)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, float, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, float, sum_ptr, sum_shift, 2, writeonly);
-#ifdef SOFTMAX_LAYER_MAX
-/** Identifies the maximum value across the 1st dimension.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width Input image width
- */
void main(void)
{
- Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
- // Initialize local maximum
- vec4 max_val1 = CONVERT(type_min1, vec4);
+ // Load max value of 1D logits vector (row)
+ vec4 max_val = vec4(LOAD_CURRENT_ITEM(max_ptr, max_iter));
- // Calculate max of row
+ // Set sum vector
+ vec4 sum1D = vec4(0);
+
+ // Shift values, exp and sum
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data1;
- GC_LOAD4_IMAGE(data1, src, (i << 2), 0);
- max_val1 = MAX_OP(data1, max_val1);
+ vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+ data = SUB_OP(data, max_val);
+ data = EXP_OP(data);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
+ sum1D = ADD_OP(sum1D, data);
}
#ifdef NON_MULTIPLE_OF_4
// Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i = i + 2)
+ for(int i = int(width2 << 2); i < int(width); i++)
{
- vec2 data;
- load_and_unpack(data, src, i, 0);
- max_val1.x = MAX_OP(data.x, max_val1.x);
- if((i + 1) < int(width))
- {
- max_val1.x = MAX_OP(data.y, max_val1.x);
- }
+ float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ data = SUB_OP(data, max_val.x);
+ data = EXP_OP(data);
+ STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
+ sum1D.x = ADD_OP(sum1D.x, data);
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
- // Perform max reduction
- max_val1.xy = MAX_OP(max_val1.xy, max_val1.zw);
- max_val1.x = MAX_OP(max_val1.x, max_val1.y);
- vec2 res1 = vec2(max_val1.x, 0.f);
- uint res;
- res = uint(packHalf2x16(res1));
+ // Perform sum reduction
+ sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+ sum1D.x = ADD_OP(sum1D.x, sum1D.y);
- // Store result
- GC_STORE1_2D_OFFSET(res, dst, 0, 0);
+ // Calculate and store result
+ STORE_CURRENT_ITEM(sum_ptr, sum_iter, sum1D.x);
}
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
- *
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
- */
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, uint, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, uint, sum_ptr, sum_shift, 2, writeonly);
+
void main(void)
{
- Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image max = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
- Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
// Load max value of 1D logits vector (row)
- vec2 datamaxinit;
- load_and_unpack(datamaxinit, max, 0, 0);
- vec4 max_val = CONVERT(datamaxinit.x, vec4);
+ vec2 datamaxinit = LOAD_UNPACK2_CURRENT_ITEM_HALF(max_ptr, max_iter);
+ vec4 max_val = vec4(datamaxinit.x);
// Set sum vector
- vec4 sum1D1 = CONVERT(0.f, vec4);
+ vec4 sum1D = vec4(0.f);
// Shift values, exp and sum
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data;
- GC_LOAD4_IMAGE(data, src, (i << 2), 0);
- data = SUB_OP(data, max_val);
- data = EXP_OP(data);
- GC_STORE4_IMAGE(data, dst, (i << 2), 0);
- sum1D1 = ADD_OP(sum1D1, data);
+ vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+ data = SUB_OP(data, max_val);
+ data = EXP_OP(data);
+ VSTORE2_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
+ sum1D = ADD_OP(sum1D, data);
}
#ifdef NON_MULTIPLE_OF_4
// Handle non multiple of 4
for(int i = int(width2 << 2); i < int(width); i = i + 2)
{
- vec2 datamiddle;
- float data1;
- load_and_unpack(datamiddle, src, i, 0);
- data1 = SUB_OP(datamiddle.x, max_val.x);
- data1 = EXP_OP(data1);
- vec2 datares1;
+ float data;
+ vec2 datamiddle = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ data = SUB_OP(datamiddle.x, max_val.x);
+ data = EXP_OP(data);
+ vec2 datares;
if((i + 1) < int(width))
{
float data2;
- data2 = SUB_OP(datamiddle.y, max_val.x);
- data2 = EXP_OP(data2);
- datares1 = vec2(data1, data2);
- data1 = ADD_OP(data2, data1);
+ data2 = SUB_OP(datamiddle.y, max_val.x);
+ data2 = EXP_OP(data2);
+ datares = vec2(data, data2);
+ data = ADD_OP(data2, data);
}
else
{
- datares1 = vec2(data1, 0.f);
+ datares = vec2(data, 0.f);
}
- uint datares;
- datares = uint(packHalf2x16(datares1));
- GC_STORE1_2D_OFFSET(datares, dst, i, 0);
- sum1D1.x = ADD_OP(sum1D1.x, data1);
+
+ STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), datares);
+
+ sum1D.x = ADD_OP(sum1D.x, data);
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
// Perform min/max reduction
- sum1D1.xy = ADD_OP(sum1D1.xy, sum1D1.zw);
- sum1D1.x = ADD_OP(sum1D1.x, sum1D1.y);
- vec2 res1 = vec2(sum1D1.x, 0.f);
- uint res;
- res = uint(packHalf2x16(res1));
+ sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+ sum1D.x = ADD_OP(sum1D.x, sum1D.y);
+
// Calculate and store result
- GC_STORE1_2D_OFFSET(res, sum, 0, 0);
+ STORE_PACK2_CURRENT_ITEM_HALF(sum_ptr, sum_iter, sum1D.xy);
}
-#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_NORM)
+
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_attrs The attributes of the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
*/
+SHADER_PARAMS_DECLARATION // NOTE(review): macro is not defined in this file (presumably helpers_cs.h); presumably opens the block of kernel arguments - confirm
+{
+ Tensor3DAttributes src_attrs; // attributes of the source tensor
+ Tensor3DAttributes sum_attrs; // attributes of the sum values tensor
+ Tensor3DAttributes dst_attrs; // attributes of the destination tensor
+};
+#if defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); // source tensor, float elements, read-only
+TENSOR_DECLARATION(2, sumBuffer, float, sum_ptr, sum_shift, 2, readonly); // sum values tensor, float elements, read-only
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly); // destination tensor, float elements, write-only
void main(void)
{
- Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift); // NOTE(review): _NO_STEP variant - presumably the sum iterator is not advanced per work item; confirm in helpers_cs.h
// Load sum value of 1D logits vector (row)
- vec2 sum1;
- load_and_unpack(sum1, sum, 0, int(gl_GlobalInvocationID.y));
- vec4 sum_val1 = CONVERT(sum1.x, vec4);
-
- vec4 data1;
- GC_LOAD4_IMAGE(data1, src, 0, 0);
- vec4 res = DIV_OP(data1, sum_val1);
- GC_STORE4_IMAGE(res, dst, 0, 0);
+ vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y))); // sum read at offset (0, gl_GlobalInvocationID.y), wrapped in a vec4 so it can divide four values at once
+ vec4 data = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter); // presumably the four source values at the current work item - confirm macro in helpers_cs.h
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, DIV_OP(data, sum_val)); // normalise: data divided by the sum, written to dst
}
-#endif // SOFTMAX_LAYER_MAX
-#endif // DATA_TYPE_FP32 \ No newline at end of file
+#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); // source tensor, accessed as uint words (presumably two packed halves per word - confirm), read-only
+TENSOR_DECLARATION(2, sumBuffer, uint, sum_ptr, sum_shift, 2, readonly); // sum values tensor, accessed as uint words, read-only
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly); // destination tensor, accessed as uint words, write-only
+void main(void)
+{
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift); // NOTE(review): _NO_STEP variant - presumably the sum iterator is not advanced per work item; confirm in helpers_cs.h
+
+ // Load sum value of 1D logits vector (row)
+ vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x); // only the .x component of the unpacked pair is used as the sum
+ vec4 data = VLOAD2_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter); // presumably four half values unpacked from two words at the current work item - confirm macro in helpers_cs.h
+ VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, DIV_OP(data, sum_val)); // normalise: data divided by the sum, presumably packed back to halves and written to dst
+}
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#endif // SOFTMAX_LAYER_MAX