author    Viet-Hoa Do <viet-hoa.do@arm.com>    2023-11-14 16:23:14 +0000
committer Viet-Hoa Do <viet-hoa.do@arm.com>    2023-11-15 10:18:37 +0000
commit    ec2afd695381d90eb3aabf4a61059ff313f9a0d2 (patch)
tree      e46cc26bd470c22bc87d91452e523b4b8ebe2b1d /src/core
parent    c63f8b048041bc32c1ece31d8faf7a7780f3a6b7 (diff)
download  ComputeLibrary-ec2afd695381d90eb3aabf4a61059ff313f9a0d2.tar.gz
Fix device issue with CL softmax
* Performing the second pass in reverse order doesn't seem to work reliably on some specific devices. This patch introduces another approach to work around the device issue.

Resolves: COMPMID-6669
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I591f05ff06f8439ebe4d32093441ae871a292f4c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10730
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
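A minimal host-side C sketch of the staging idea behind the fix (not the kernel itself): the temporary tensor is sized for the wider type, the quantized copy made by the first pass is parked at the end of it, and the second pass then dequantizes in place front to back without clobbering unread values. The int8/float types, buffer sizes and scale factor below are illustrative assumptions, not the kernel's actual DATA_TYPE/TMP_DATA_TYPE configuration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LENGTH   4  /* vectors along the softmax axis (illustrative) */
#define VEC_SIZE 2  /* elements per vector (illustrative) */

int main(void)
{
    /* Temporary buffer sized for the wider type, like the kernel's tmp tensor. */
    uint8_t tmp[LENGTH * VEC_SIZE * sizeof(float)];

    /* First pass: park the raw quantized values at the end of the buffer, at
     * tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(float) - sizeof(int8_t)). */
    const size_t extra = LENGTH * VEC_SIZE * (sizeof(float) - sizeof(int8_t));
    const int8_t src[LENGTH * VEC_SIZE] = {-3, 7, 12, -8, 5, 0, 9, -1};
    memcpy(tmp + extra, src, sizeof(src));

    /* Second pass: walk forward. The 4-byte float written for element i ends at
     * byte 4*i + 3, which never reaches a quantized element that has not been
     * read yet, so no reverse-order loop is needed. */
    for (int i = 0; i < LENGTH * VEC_SIZE; ++i)
    {
        const int8_t q = *(const int8_t *)(tmp + extra + (size_t)i * sizeof(int8_t));
        const float  f = (float)q * 0.05f; /* stand-in dequantization scale */
        memcpy(tmp + (size_t)i * sizeof(float), &f, sizeof(f));
    }

    /* The buffer now holds the dequantized values in the wider format. */
    for (int i = 0; i < LENGTH * VEC_SIZE; ++i)
    {
        float f;
        memcpy(&f, tmp + (size_t)i * sizeof(float), sizeof(f));
        printf("%g ", f);
    }
    printf("\n");
    return 0;
}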
Diffstat (limited to 'src/core')
-rw-r--r-- src/core/CL/cl_kernels/common/softmax_layer.cl | 31
1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index 58c458982d..bfc0995bb8 100644
--- a/src/core/CL/cl_kernels/common/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -272,6 +272,22 @@ __kernel void softmax_non_x(
dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+ // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+ //
+ // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+ // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
+ // to the second pass to reduce memory bandwidth of the first pass.
+ //
+ // In the second pass, it reads the quantized data from the temporary tensor and writes the dequantized data
+ // back to the temporary tensor.
+ //
+ // To avoid dequantized data overwriting the unprocessed quantized data in the temporary tensor,
+ // this extra offset is introduced to store the quantized data at the end of the temporary tensor.
+ //
+ // Note: Another approach is to perform the second pass in reverse order, but for an unexplained reason
+ // it doesn't work on some devices.
+ uint tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(TMP_DATA_TYPE) - sizeof(DATA_TYPE));
+
// Calculate max value and store the input data to the temporary tensor in suitable format.
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
int i = 0;
@@ -282,7 +298,7 @@ __kernel void softmax_non_x(
max_value = max(max_value, data);
- VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE)));
+ VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE)));
}
// Regularize the data.
@@ -296,18 +312,9 @@ __kernel void softmax_non_x(
# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
#endif // IS_QUANTIZED
- for (i = LENGTH - 1; i >= 0; --i)
+ for (i = 0; i < LENGTH; ++i)
{
- // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
- //
- // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
- // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
- // to the second pass pass to reduce memory bandwidth of the first pass.
- //
- // This pass reads the quantized data from the temporary tensor and writes the dequantized data
- // back to the temporary tensor, hence we need to loop in reverse to avoid overwriting unprocessed data.
-
- VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
data = REGULARIZE(data);
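A worked example of the new offset, using illustrative values rather than any particular build configuration: for LENGTH = 64, VEC_SIZE = 4, DATA_TYPE = uchar (1 byte) and TMP_DATA_TYPE = float (4 bytes), tmp_extra_offset = 64 * 4 * (4 - 1) = 768 bytes. The quantized copy written by the first pass then occupies bytes 768..1023 of the 1024-byte temporary row, while the second pass fills the row from byte 0 with 4-byte values; the store for element i ends at byte 4*i + 3 and never reaches a quantized element that has not yet been read, which is why the forward loop introduced by this patch is safe.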