about summary refs log tree commit diff
path: root/src/core/CL/cl_kernels/softmax_layer.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/softmax_layer.cl')
-rw-r--r-- src/core/CL/cl_kernels/softmax_layer.cl | 415 +++++++++++-----------
1 file changed, 168 insertions(+), 247 deletions(-)
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 77dbb47e41..01f5de47cf 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -23,55 +23,15 @@
*/
#include "helpers.h"
-#define MAX_OP(x, y, type, size) max((x), (y))
-#define ADD_OP(x, y, type, size) ((x) + (y))
-#define SUB_OP(x, y, type, size) ((x) - (y))
-#define MUL_OP(x, y, type, size) ((x) * (y))
-#define DIV_OP(x, y, type, size) ((x) / (y))
-#define EXP_OP(x, type, size) exp((x))
-
-#ifdef USE_F16
-#define MINVAL -HALF_MAX
-#define SELECT_DATA_TYPE short
-#else /* USE_F16 */
-#define MINVAL -FLT_MAX
-#define SELECT_DATA_TYPE int
-#endif /* USE_F16 */
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-/* Vector size, i.e. number of vector elements. */
-#if VECTOR_SIZE == 2
-__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
-__constant uint2 idx__ = (uint2)(0, 1);
-
-#elif VECTOR_SIZE == 4
-__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
-__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
-
-#elif VECTOR_SIZE == 8
-__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
-__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
-
-#else /* VECTOR_SIZE DEFAULT */
-#define VECTOR_SIZE 16
-#define LOG_VECTOR_SIZE 4
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
-__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-#endif /* VECTOR_SIZE END */
-
-// TODO (COMPMID-661): Remove if the non-fused kernels are removed
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
-__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
+#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -103,28 +63,49 @@ __kernel void softmax_layer_norm(
TENSOR3D_DECLARATION(sum),
TENSOR3D_DECLARATION(dst))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
// Load max value of 1D logits vector (row)
DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
-#ifdef LOG_SOFTMAX
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(LOG_SOFTMAX)
sum_val = log(sum_val);
- vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#else /* LOG_SOFTMAX */
- vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#endif /* LOG_SOFTMAX */
+ data0 -= sum_val;
+#else // defined(LOG_SOFTMAX)
+ data0 /= sum_val;
+#endif // defined(LOG_SOFTMAX)
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
* then gets the exponent of each element as sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -158,136 +139,102 @@ __kernel void softmax_layer_norm(
* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
*/
__kernel void softmax_layer_max_shift_exp_sum_serial(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(maxo),
TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
#ifdef BETA
// Initialize beta
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+ VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */
// Initialize local maximum
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
-
- // Calculate max of row
- const uint width_ = width >> LOG_VECTOR_SIZE;
- for(uint i = 0; i < width_; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
- }
+ VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
- VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ {
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+ max_val_vec = max(data, max_val_vec);
+ }
+
// Perform max reduction
-#if VECTOR_SIZE == 16
- max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
- max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
- max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
- // Store result
- *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+ DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
+ *((__global DATA_TYPE *)maxo.ptr) = max_val;
/* Second section */
- // Load max value of 1D logits vector (row)
- DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
-
// Set sum vector
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- sum1D = 0;
+ VEC_TYPE sum1D = 0;
- // Shift values, exp and sum
- for(uint i = 0; i < width_; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)dst_addr);
+ data = exp(data);
+ data = select(0, data, widx);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+ data = exp(data);
+ data = select(0, data, widx);
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)dst_addr);
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
- }
+ sum1D += data;
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+ // Shift values, exp and sum
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ {
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- data = select(0, data, widx);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
+ data = exp(data);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- data = select(0, data, widx);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ sum1D += data;
+ }
// Perform sum reduction
-#if VECTOR_SIZE == 16
- sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
- sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
- sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
-
- // Calculate and store result
- *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+ *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
* then gets the exponent of each element as sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -321,71 +268,59 @@ __kernel void softmax_layer_max_shift_exp_sum_serial(
* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
*/
__kernel void softmax_layer_max_shift_exp_sum_parallel(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(maxo),
TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ const uint lid = get_local_id(0);
+ const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
- const uint lid = get_local_id(0);
-
#ifdef BETA
// Initialize beta
- VEC_DATA_TYPE(DATA_TYPE, 4)
- beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+ VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */
// Define one temporary vector per work-item.
- __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+ __local VEC_TYPE tmp_local[GRID_SIZE];
__local DATA_TYPE max_local;
- __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
- // Number of elements per work-item.
- const uint row = width / GRID_SIZE;
+ VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
+
// Number of iterations per work-item.
- const uint width_ = row >> 2;
+ const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
// Calculate max of row
uint i = 0;
- for(; i < width_; i++)
+ for(; i < width; ++i)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+ VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ max_val_vec = max(data_max, max_val_vec);
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
// How many work-items needed to complete the computation.
//TODO: Optimize this calculation (avoid %).
- int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
if(lid < boundary_workitems)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+ VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ max_val_vec = max(data_max, max_val_vec);
}
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ SELECT_TYPE widx;
+ if(lid == 0)
{
// Handle non multiple of 4
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
- max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
+ VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
}
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -397,7 +332,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 128)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -405,7 +340,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 64)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -413,7 +348,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 32)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -421,7 +356,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 16)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -429,7 +364,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 8)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -437,7 +372,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 4)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -445,99 +380,84 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 2)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
- max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
- max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
- max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
- max_local = max_val_vec.s0;
+ max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
+ max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Second section */
// Set sum vector
- VEC_DATA_TYPE(DATA_TYPE, 4)
- sum1D = 0;
+ VEC_TYPE sum1D = 0;
DATA_TYPE max_val = max_local;
// Shift values, exp and sum
- for(i = 0; i < width_; i++)
+ for(i = 0; i < width; ++i)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data = exp(data);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
//TODO: Optimize the calculation (avoid %).
- boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
if(lid < boundary_workitems)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data = exp(data);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
}
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ if(lid == 0)
{
// Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ data = exp(data);
data = select(0, data, widx);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+ data = exp(data);
data = select(0, data, widx);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
}
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -549,7 +469,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 128)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -557,7 +477,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 64)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -565,7 +485,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 32)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -573,7 +493,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 16)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -581,7 +501,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 8)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -589,7 +509,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 4)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -597,16 +517,17 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 2)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
- sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
- // Perform max reduction
- sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
- sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
- *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+ sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
+ // Perform sum reduction
+ *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
}
+
+#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) \ No newline at end of file