diff options
Diffstat (limited to 'src/core/CL/cl_kernels/softmax_layer_quantized.cl')
-rw-r--r-- | src/core/CL/cl_kernels/softmax_layer_quantized.cl | 7 |
1 file changed, 3 insertions, 4 deletions
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 7521c8e1ee..c055381fc5 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -232,7 +232,6 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
  *
  * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -288,7 +287,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
     __local uchar max_local;
 
     uchar4 uchar_min = (uchar4)0;
-    uchar4 max_val_vec = (uchar4)uchar_min;
+    uchar4 max_val_vec = uchar_min;
 
     // Number of elements per work-item.
     const uint row = width / GRID_SIZE;
@@ -441,8 +440,8 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
         int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
         data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
         data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
-        uchar4 widx = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
-        data = select(uchar_min, data, widx);
+        int4 widx = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
+        data_fp = select(0, data_fp, widx);
         vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
         sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
     }