From 2d1a835b68eb27a800838fc2b563b12eddf2c19f Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Mon, 26 Oct 2020 15:04:08 +0000
Subject: COMPMID-3735 Remove OpenCL padding: CLSoftmaxLayerKernel

- Renamed SELECT_DATA_TYPE to SELECT_VEC_DATA_TYPE to reflect its usage with vectors. SELECT_DATA_TYPE(dt) now returns the primitive data type
- Changed the interface of VEC_OFFS and V_OFFS so that they receive the primitive data type as a parameter rather than its vector form
- Performed a general cleanup of the kernels: created macros for the sum and max reductions, and removed redundant macros, defines, variables and calculations
- Used VECTOR_SIZE and VECTOR_SIZE_LEFTOVER in every kernel to allow computation on smaller shapes without adding padding
- Removed the actual padding from the kernel and adjusted its calculations accordingly. Added asserts to check that the padding has been removed, and removed the now-invalid Validate tests

Change-Id: If5ccbd5d34e255d38c7f6bfe8740e2b80b28e264
Signed-off-by: Giorgio Arena
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4277
Comments-Addressed: Arm Jenkins
Reviewed-by: SiCong Li
Reviewed-by: Gian Marco Iodice
Tested-by: Arm Jenkins
---
 src/core/CL/cl_kernels/softmax_layer.cl | 415 +++++++++++++-------------------
 1 file changed, 168 insertions(+), 247 deletions(-)

diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 77dbb47e41..01f5de47cf 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -23,55 +23,15 @@
  */
 #include "helpers.h"
 
-#define MAX_OP(x, y, type, size) max((x), (y))
-#define ADD_OP(x, y, type, size) ((x) + (y))
-#define SUB_OP(x, y, type, size) ((x) - (y))
-#define MUL_OP(x, y, type, size) ((x) * (y))
-#define DIV_OP(x, y, type, size) ((x) / (y))
-#define EXP_OP(x, type, size) exp((x))
-
-#ifdef USE_F16
-#define MINVAL -HALF_MAX
-#define SELECT_DATA_TYPE short
-#else /* USE_F16 */
-#define MINVAL -FLT_MAX
-#define SELECT_DATA_TYPE int
-#endif /* USE_F16 */
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-/* Vector size, i.e. number of vector elements. */
-#if VECTOR_SIZE == 2
-__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
-__constant uint2 idx__ = (uint2)(0, 1);
-
-#elif VECTOR_SIZE == 4
-__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
-__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
-
-#elif VECTOR_SIZE == 8
-__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
-__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
-
-#else /* VECTOR_SIZE DEFAULT */
-#define VECTOR_SIZE 16
-#define LOG_VECTOR_SIZE 4
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
-__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-#endif /* VECTOR_SIZE END */
-
-// TODO (COMPMID-661): Remove if the non-fused kernels are removed
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
-__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
+#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
 
 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
  *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
  *
  * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -103,28 +63,49 @@ __kernel void softmax_layer_norm(
     TENSOR3D_DECLARATION(sum),
     TENSOR3D_DECLARATION(dst))
 {
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
 
     // Load max value of 1D logits vector (row)
     DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
-#ifdef LOG_SOFTMAX
+
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(LOG_SOFTMAX)
     sum_val = log(sum_val);
-    vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#else /* LOG_SOFTMAX */
-    vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#endif /* LOG_SOFTMAX */
+    data0 -= sum_val;
+#else // defined(LOG_SOFTMAX)
+    data0 /= sum_val;
+#endif // defined(LOG_SOFTMAX)
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 
+#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
  * then gets the exponent of each element as sums all elements across each row.
  *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
  * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
  * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
  *
  * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -158,136 +139,102 @@ __kernel void softmax_layer_norm(
  * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
  * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
  */
 __kernel void softmax_layer_max_shift_exp_sum_serial(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(maxo),
     TENSOR3D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(sum),
-    uint width)
+    TENSOR3D_DECLARATION(sum))
 {
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
     Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 
 #ifdef BETA
     // Initialize beta
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+    VEC_TYPE beta = (VEC_TYPE)BETA;
 #endif /* BETA */
 
     // Initialize local maximum
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
-
-    // Calculate max of row
-    const uint width_ = width >> LOG_VECTOR_SIZE;
-    for(uint i = 0; i < width_; i++)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-        data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
-        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
-    }
+    VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
 
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
-    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
-    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
-    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+    VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+    SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+    max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 
+    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+    {
+        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+        max_val_vec = max(data, max_val_vec);
+    }
+
     // Perform max reduction
-#if VECTOR_SIZE == 16
-    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
-    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
-    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
-    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
-
-    // Store result
-    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+    DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
+    *((__global DATA_TYPE *)maxo.ptr) = max_val;
 
     /* Second section */
 
-    // Load max value of 1D logits vector (row)
-    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
-
     // Set sum vector
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    sum1D = 0;
+    VEC_TYPE sum1D = 0;
 
-    // Shift values, exp and sum
-    for(uint i = 0; i < width_; i++)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
-        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    data -= max_val;
 #ifdef BETA
-        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+    data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
-        VSTORE(VECTOR_SIZE)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
-        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+    (data, 0, (__global DATA_TYPE *)dst_addr);
+    data = exp(data);
+    data = select(0, data, widx);
 #else /* LOG_SOFTMAX */
-        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
-        VSTORE(VECTOR_SIZE)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+    data = exp(data);
+    data = select(0, data, widx);
+    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+    (data, 0, (__global DATA_TYPE *)dst_addr);
 #endif /* LOG_SOFTMAX */
-        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
-    }
+    sum1D += data;
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
-    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+    // Shift values, exp and sum
+    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+    {
+        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+        data -= max_val;
 #ifdef BETA
-    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+        data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
-    VSTORE(VECTOR_SIZE)
-    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
-    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
-    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
-    data = select(0, data, widx);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
+        data = exp(data);
#else /* LOG_SOFTMAX */
-    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
-    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
-    data = select(0, data, widx);
-    VSTORE(VECTOR_SIZE)
-    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+        data = exp(data);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
 #endif /* LOG_SOFTMAX */
-    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+        sum1D += data;
+    }
 
     // Perform sum reduction
-#if VECTOR_SIZE == 16
-    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
-    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
-    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
-    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
-
-    // Calculate and store result
-    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+    *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
 }
 
 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
  * then gets the exponent of each element as sums all elements across each row.
  *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
  * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
  * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
  *
  * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -321,71 +268,59 @@ __kernel void softmax_layer_max_shift_exp_sum_serial(
  * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
  * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
  */
 __kernel void softmax_layer_max_shift_exp_sum_parallel(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(maxo),
     TENSOR3D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(sum),
-    uint width)
+    TENSOR3D_DECLARATION(sum))
 {
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    const uint lid = get_local_id(0);
+    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
     Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 
-    const uint lid = get_local_id(0);
-
 #ifdef BETA
     // Initialize beta
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+    VEC_TYPE beta = (VEC_TYPE)BETA;
 #endif /* BETA */
 
     // Define one temporary vector per work-item.
-    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+    __local VEC_TYPE tmp_local[GRID_SIZE];
     __local DATA_TYPE max_local;
 
-    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
-
-    // Number of elements per work-item.
-    const uint row = width / GRID_SIZE;
+    VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
 
+    // Number of iterations per work-item.
-    const uint width_ = row >> 2;
+    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
 
     // Calculate max of row
     uint i = 0;
-    for(; i < width_; i++)
+    for(; i < width; ++i)
     {
-        VEC_DATA_TYPE(DATA_TYPE, 4)
-        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
-        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+        max_val_vec = max(data_max, max_val_vec);
     }
 
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
     // How many work-items needed to complete the computation.
     //TODO: Optimize this calculation (avoid %).
-    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
     if(lid < boundary_workitems)
     {
-        VEC_DATA_TYPE(DATA_TYPE, 4)
-        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
-        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+        max_val_vec = max(data_max, max_val_vec);
     }
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
-    if(boundary_workitems == 0)
-    {
-        boundary_workitems = GRID_SIZE;
-        i--;
-    }
-    if(lid == (boundary_workitems - 1))
+    SELECT_TYPE widx;
+    if(lid == 0)
     {
         // Handle non multiple of 4
-        VEC_DATA_TYPE(DATA_TYPE, 4)
-        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
-        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
-        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
-        max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
+        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+        widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+        max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
     }
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -397,7 +332,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 128)
         {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -405,7 +340,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 64)
        {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -413,7 +348,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 32)
         {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -421,7 +356,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
    {
         if(lid < 16)
         {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -429,7 +364,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 8)
         {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -437,7 +372,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 4)
         {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -445,99 +380,84 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 2)
         {
-            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
     if(lid == 0)
     {
-        max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
-        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
-        max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
-        max_local = max_val_vec.s0;
+        max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
+        max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     /* Second section */
 
     // Set sum vector
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    sum1D = 0;
+    VEC_TYPE sum1D = 0;
 
     DATA_TYPE max_val = max_local;
 
     // Shift values, exp and sum
-    for(i = 0; i < width_; i++)
+    for(i = 0; i < width; ++i)
     {
-        VEC_DATA_TYPE(DATA_TYPE, 4)
-        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
-        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+        data -= max_val;
 #ifdef BETA
-        data = MUL_OP(data, beta, DATA_TYPE, 4);
+        data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
-        VSTORE(4)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
-        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+        data = exp(data);
#else /* LOG_SOFTMAX */
-        data = EXP_OP(data, DATA_TYPE, 4);
-        VSTORE(4)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        data = exp(data);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
 #endif /* LOG_SOFTMAX */
-        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+        sum1D += data;
     }
 
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
     //TODO: Optimize the calculation (avoid %).
-    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
     if(lid < boundary_workitems)
     {
-        VEC_DATA_TYPE(DATA_TYPE, 4)
-        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
-        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+        data -= max_val;
 #ifdef BETA
-        data = MUL_OP(data, beta, DATA_TYPE, 4);
+        data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
-        VSTORE(4)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
-        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+        data = exp(data);
#else /* LOG_SOFTMAX */
-        data = EXP_OP(data, DATA_TYPE, 4);
-        VSTORE(4)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        data = exp(data);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
 #endif /* LOG_SOFTMAX */
-        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+        sum1D += data;
     }
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
-    if(boundary_workitems == 0)
-    {
-        boundary_workitems = GRID_SIZE;
-        i--;
-    }
-    if(lid == (boundary_workitems - 1))
+    if(lid == 0)
     {
         // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
-        VEC_DATA_TYPE(DATA_TYPE, 4)
-        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
-        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+        data -= max_val;
 #ifdef BETA
-        data = MUL_OP(data, beta, DATA_TYPE, 4);
+        data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
-        VSTORE(4)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
-        data = EXP_OP(data, DATA_TYPE, 4);
-        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
-        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+        (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+        data = exp(data);
         data = select(0, data, widx);
 #else /* LOG_SOFTMAX */
-        data = EXP_OP(data, DATA_TYPE, 4);
-        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
-        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        data = exp(data);
         data = select(0, data, widx);
-        VSTORE(4)
-        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+        (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
 #endif /* LOG_SOFTMAX */
-        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+        sum1D += data;
     }
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -549,7 +469,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 128)
         {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 128];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -557,7 +477,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 64)
        {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 64];
        }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -565,7 +485,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 32)
         {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 32];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -573,7 +493,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 16)
         {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 16];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -581,7 +501,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 8)
         {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 8];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -589,7 +509,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 4)
         {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 4];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -597,16 +517,17 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
     {
         if(lid < 2)
         {
-            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+            tmp_local[lid] += tmp_local[lid + 2];
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
     if(lid == 0)
     {
-        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
-        // Perform max reduction
-        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
-        sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
-        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+        sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
+        // Perform sum reduction
+        *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
     }
 }
+
+#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
\ No newline at end of file
-- 
cgit v1.2.1
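For reference, the leftover-vector idiom that this patch introduces can be sketched in plain OpenCL C, with the library helpers (VEC_OFFS, SELECT_VEC_DATA_TYPE, MAX_REDUCE) expanded by hand. This is a minimal sketch rather than code from the patch: the kernel name and signature are hypothetical, and it assumes float data, a vector size of 4, one work-item per row, and a row width of at least 4 (ComputeLibrary guarantees the equivalent by adjusting the vector size to the input width).

// Per-row max with no padding: mask the first (possibly partial) vector,
// then read full, always in-bounds vectors starting at the leftover offset.
__kernel void row_max_no_padding(__global const float *src,     // one row of 'width' floats
                                 __global float       *max_out, // scalar result
                                 const int             width)   // plays the role of SRC_WIDTH
{
    const int leftover    = width % 4;          // VECTOR_SIZE_LEFTOVER for VECTOR_SIZE == 4
    float4    max_val_vec = (float4)(-FLT_MAX); // MINVAL for float

    if(leftover != 0)
    {
        // Full vector load at x == 0; lanes at or beyond 'leftover' are replaced
        // by -FLT_MAX so no element outside the leftover region affects the max
        // (this load needs width >= 4 to stay in bounds).
        float4 data = vload4(0, src);
        int4   widx = (int4)(leftover) > (int4)(0, 1, 2, 3); // VEC_OFFS-style lane indices
        max_val_vec = max(max_val_vec, select((float4)(-FLT_MAX), data, widx));
    }

    // Every remaining load starts at 'leftover', so each vload4 is in bounds.
    for(int i = leftover; i < width; i += 4)
    {
        max_val_vec = max(max_val_vec, vload4(0, src + i));
    }

    // MAX_REDUCE equivalent for a 4-lane vector.
    float2 m = max(max_val_vec.s01, max_val_vec.s23);
    *max_out = max(m.s0, m.s1);
}

The mask works because vector relational operators in OpenCL return 0 or -1 per lane and select() keys off each lane's most significant bit, which is why SELECT_DATA_TYPE(dt) must name a signed integer type of the same width as dt (int for float, short for half). Starting the strided loop at the leftover offset, instead of masking a trailing partial vector, is what lets every full-width access stay in bounds without tensor padding; STORE_VECTOR_SELECT plays the matching role on the store side.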