COMPMID-939 Fix mismatches and finalize CLSoftmaxLayer optimization

Change-Id: I4404f91a270e0ba7bbb7451c4c43a485fd4a3f6c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/121105
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Giorgio Arena <giorgio.arena@arm.com> 2018-02-19 15:33:41 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:47:18 +0000
commit: 72f39be2f372b9a810cb27320dba5d0722407549 (patch)
tree: 3ac73795c23107f7eab88cec553be9b9ed248214 /src/core/CL/cl_kernels/softmax_layer_quantized.cl
parent: 7bfb199c1461fd553a78ca1947855c7d25106803 (diff)
download: ComputeLibrary-72f39be2f372b9a810cb27320dba5d0722407549.tar.gz
1 file changed, 3 insertions, 4 deletions
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 7521c8e1ee..c055381fc5 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -232,7 +232,6 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
  *
  * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
@@ -288,7 +287,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
     __local uchar max_local;
 
     uchar4 uchar_min   = (uchar4)0;
-    uchar4 max_val_vec = (uchar4)uchar_min;
+    uchar4 max_val_vec = uchar_min;
 
     // Number of elements per work-item.
     const uint row = width / GRID_SIZE;
@@ -441,8 +440,8 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
         int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
         data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
         data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
-        uchar4 widx         = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
-        data                = select(uchar_min, data, widx);
+        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
+        data_fp             = select(0, data_fp, widx);
         vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
         sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
     }
author	Giorgio Arena <giorgio.arena@arm.com>	2018-02-19 15:33:41 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:47:18 +0000
commit	72f39be2f372b9a810cb27320dba5d0722407549 (patch)
tree	3ac73795c23107f7eab88cec553be9b9ed248214 /src/core/CL/cl_kernels/softmax_layer_quantized.cl
parent	7bfb199c1461fd553a78ca1947855c7d25106803 (diff)
download	ComputeLibrary-72f39be2f372b9a810cb27320dba5d0722407549.tar.gz