aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/common/softmax_layer.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/common/softmax_layer.cl')
-rw-r--r--src/core/CL/cl_kernels/common/softmax_layer.cl747
1 files changed, 290 insertions, 457 deletions
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index 4d2d89dd73..58c458982d 100644
--- a/src/core/CL/cl_kernels/common/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,511 +21,344 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
+#define MIN_VALUE_float -FLT_MAX
+#define MIN_VALUE_half -HALF_MAX
+#define MIN_VALUE_char CHAR_MIN
+#define MIN_VALUE_uchar 0
+
+#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type
+#define MIN_VALUE_TYPE(data_type) MIN_VALUE_TYPE_STR(data_type)
+#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE)
+
+#ifdef SOFTMAX_X
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+/** 3-pass softmax in the x dimension.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ * - VEC_SIZE: the size of the vector.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
*/
-__kernel void softmax_layer_norm(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
+__kernel void softmax_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element
+
+#ifdef IS_QUANTIZED
+ ,
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element
+#endif // IS_QUANTIZED
+)
{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
+ const int dim_0 = get_global_id(0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+#ifdef IS_QUANTIZED
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+#else // IS_QUANTIZED
+ __global uchar *tmp_ptr = dst_ptr;
+#endif // IS_QUANTIZED
- // Load max value of 1D logits vector (row)
- DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ // Calculate max value.
+ DATA_TYPE max_value = MIN_VALUE;
+ int i = 0;
-#if defined(LOG_SOFTMAX)
- sum_val = log(sum_val);
- data0 -= sum_val;
-#else // defined(LOG_SOFTMAX)
- data0 /= sum_val;
-#endif // defined(LOG_SOFTMAX)
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)));
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
+ max_value = max(max_value, MAX_REDUCE(data, VEC_SIZE));
+ }
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+ for (; i < LENGTH; ++i)
+ {
+ DATA_TYPE data = *(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE));
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
+ max_value = max(max_value, data);
+ }
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ // Regularize the data.
+ TMP_DATA_TYPE sum_value = 0;
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+#ifdef IS_QUANTIZED
+ TMP_DATA_TYPE max_value_f = (CONVERT(max_value, TMP_DATA_TYPE) - SRC_OFFSET) * SRC_SCALE;
+ TMP_DATA_TYPE regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
+ data = REGULARIZE(data);
- // Initialize local maximum
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
+#ifdef IS_LOG
+ sum_value += SUM_REDUCE(exp(data), VEC_SIZE);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += SUM_REDUCE(data, VEC_SIZE);
+#endif // IS_LOG
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+ }
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ for (; i < LENGTH; ++i)
{
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
+ TMP_DATA_TYPE data = CONVERT(*(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)), TMP_DATA_TYPE);
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += data;
+#endif // IS_LOG
+
+ *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)) = data;
}
- // Perform max reduction
- DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- *((__global DATA_TYPE *)maxo.ptr) = max_val;
-
- /* Second section */
-
- // Set sum vector
- VEC_TYPE sum1D = 0;
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
-#endif /* LOG_SOFTMAX */
- sum1D += data;
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- // Shift values, exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte)
+# else // IS_LOG
+ TMP_DATA_TYPE norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(SIZE, x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(SIZE, x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(SIZE, x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
{
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result = NORMALIZE(VEC_SIZE, data);
+
+ VSTORE(VEC_SIZE)(result, 0, (__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)));
}
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
+ for (; i < LENGTH; ++i)
+ {
+ TMP_DATA_TYPE data = *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE));
+
+ DATA_TYPE result = NORMALIZE(1, data);
+
+ *(__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)) = result;
+ }
+
+#undef NORMALIZE
}
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then gets the exponent of each element as sums all elements across each row.
+#endif // SOFTMAX_X
+
+#ifdef SOFTMAX_NON_X
+
+/** 3-pass softmax in any dimension higher than the x dimension.
+ *
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ * - VEC_SIZE: the size of the vector.
+ * - VEC_SIZE_LEFTOVER: the size of the leftover part.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
*/
-__kernel void softmax_layer_max_shift_exp_sum_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
+__kernel void softmax_non_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element,
+
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element,
+
+ uint src_stride_axis,
+ uint dst_stride_axis
+)
{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
+ const int dim_0 = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ // Calculate max value and store the input data to the temporary tensor in suitable format.
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
+ int i = 0;
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
-
- // Define one temporary vector per work-item.
- __local VEC_TYPE tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- // How many work-items needed to complete the computation.
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
+ for (i = 0; i < LENGTH; ++i)
{
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- SELECT_TYPE widx;
- if(lid == 0)
- {
- // Handle non multiple of 4
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = max_val_vec;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * src_stride_axis));
- barrier(CLK_LOCAL_MEM_FENCE);
+ max_value = max(max_value, data);
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE)));
}
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- /* Second section */
+ // Regularize the data.
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) sum_value = 0;
- // Set sum vector
- VEC_TYPE sum1D = 0;
- DATA_TYPE max_val = max_local;
+#ifdef IS_QUANTIZED
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) max_value_f = (CONVERT(max_value, VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)) - SRC_OFFSET) * SRC_SCALE;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
+ for (i = LENGTH - 1; i >= 0; --i)
{
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
+ // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+ //
+ // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+ // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
+ // to the second pass pass to reduce memory bandwidth of the first pass.
+ //
+ // This pass reads the quantized data from the temporary tensor and writes the dequantized data
+ // back to the temporary tensor, hence we need to loop in reverse to avoid overwriting unprocessed data.
+
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
+ sum_value += data;
+#endif // IS_LOG
+
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
}
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
+
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte)
+# else // IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, VEC_SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH; ++i)
{
- // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
- barrier(CLK_LOCAL_MEM_FENCE);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result0 = NORMALIZE(data);
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
+ STORE_VECTOR_SELECT(result, DATA_TYPE, dst_ptr + i * dst_stride_axis, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+
+#undef NORMALIZE
}
-#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) \ No newline at end of file
+#endif // SOFTMAX_NON_X
+
+#undef MIN_VALUE
+#undef MIN_VALUE_TYPE
+#undef MIN_VALUE_TYPE_STR
+
+#undef MIN_VALUE_float
+#undef MIN_VALUE_half
+#undef MIN_VALUE_char
+#undef MIN_VALUE_uchar