From ec2afd695381d90eb3aabf4a61059ff313f9a0d2 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do
Date: Tue, 14 Nov 2023 16:23:14 +0000
Subject: Fix device issue with CL softmax

* Performing the second pass in reverse order doesn't seem to work
  reliably on some specific devices. This patch introduces another
  approach to work around the device issue.

Resolves: COMPMID-6669
Signed-off-by: Viet-Hoa Do
Change-Id: I591f05ff06f8439ebe4d32093441ae871a292f4c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10730
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Comments-Addressed: Arm Jenkins
---
 src/core/CL/cl_kernels/common/softmax_layer.cl | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'src/core')

diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index 58c458982d..bfc0995bb8 100644
--- a/src/core/CL/cl_kernels/common/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -272,6 +272,22 @@ __kernel void softmax_non_x(
     dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
     tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
 
+    // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+    //
+    // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+    // Dequantization is not needed to find the max value, and since dequantization widens the data, we defer it
+    // to the second pass to reduce the memory bandwidth of the first pass.
+    //
+    // The second pass reads the quantized data from the temporary tensor and writes the dequantized data
+    // back to the temporary tensor.
+    //
+    // To avoid the dequantized data overwriting the unprocessed quantized data in the temporary tensor,
+    // this extra offset is introduced to store the quantized data at the end of the temporary tensor.
+    //
+    // Note: Another approach is to perform the second pass in reverse order, but for an unexplained reason
+    // it doesn't work on some devices.
+    uint tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(TMP_DATA_TYPE) - sizeof(DATA_TYPE));
+
     // Calculate max value and store the input data to the temporary tensor in suitable format.
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
     int i = 0;
@@ -282,7 +298,7 @@ __kernel void softmax_non_x(
 
         max_value = max(max_value, data);
 
-        VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE)));
+        VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE)));
     }
 
     // Regularize the data.
@@ -296,18 +312,9 @@ __kernel void softmax_non_x(
 #   define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
 #endif // IS_QUANTIZED
 
-    for (i = LENGTH - 1; i >= 0; --i)
+    for (i = 0; i < LENGTH; ++i)
     {
-        // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
-        //
-        // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
-        // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
-        // to the second pass pass to reduce memory bandwidth of the first pass.
-        //
-        // This pass reads the quantized data from the temporary tensor and writes the dequantized data
-        // back to the temporary tensor, hence we need to loop in reverse to avoid overwriting unprocessed data.
-
-        VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+        VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
 
         data = REGULARIZE(data);
 
-- 
cgit v1.2.1
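Editor's illustration (not part of the patch): the non-overlap argument behind tmp_extra_offset can be checked with a few lines of host-side C. The sizes below are assumptions: LENGTH, VEC_SIZE, DATA_TYPE and TMP_DATA_TYPE are compile-time macros of the kernel, and a 1-byte quantized input dequantized to a 4-byte float is only one possible configuration. The sketch also assumes the second pass writes the dequantized vector i back at offset i * VEC_SIZE * sizeof(TMP_DATA_TYPE), which is what the comment describes but the hunk shown here does not include.

    /* Sketch only: example sizes, not the kernel's actual build configuration. */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned length   = 8;  /* LENGTH: vectors along the softmax axis (assumed) */
        const unsigned vec_size = 16; /* VEC_SIZE (assumed)                               */
        const unsigned d_sz     = 1;  /* sizeof(DATA_TYPE), e.g. a 1-byte quantized type  */
        const unsigned t_sz     = 4;  /* sizeof(TMP_DATA_TYPE), e.g. float                */

        /* The patch stores the quantized copy at the end of the temporary row. */
        const unsigned extra = length * vec_size * (t_sz - d_sz); /* tmp_extra_offset */

        for (unsigned i = 0; i < length; ++i)
        {
            /* Iteration i of the forward second pass: read quantized vector i at
             * extra + i * vec_size * d_sz, then (assumed) write the dequantized
             * vector i at i * vec_size * t_sz. */
            const unsigned write_end    = (i + 1) * vec_size * t_sz;
            const unsigned unread_begin = extra + (i + 1) * vec_size * d_sz;

            /* The dequantized store never reaches quantized data not yet read. */
            assert(write_end <= unread_begin);
        }

        printf("tmp_extra_offset = %u bytes; forward second pass is overlap-safe\n", extra);
        return 0;
    }

With these example values the extra offset is 8 * 16 * (4 - 1) = 384 bytes: at step i the dequantized store ends at (i + 1) * 64 bytes while the first unread quantized byte sits at 384 + (i + 1) * 16 bytes, so the ascending loop never clobbers data it still has to read, which is why the reverse-order loop is no longer needed.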