From b0608065aa3359b41ddc83dcb66adf489006fcd2 Mon Sep 17 00:00:00 2001
From: Adnan AlSinan
Date: Wed, 29 Sep 2021 16:50:46 +0100
Subject: Improve performance of Softmax uint8 on GPU

Resolves COMPMID-4805

Change-Id: I0acd4479f196cf9518995a60d3b57a9a49e0db57
Signed-off-by: Adnan AlSinan
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6413
Tested-by: Arm Jenkins
Comments-Addressed: Pablo Marquez Tello
Comments-Addressed: Arm Jenkins
Reviewed-by: Pablo Marquez Tello
Reviewed-by: Freddie Liardet
Reviewed-by: Gian Marco Iodice
---
 .../cl_kernels/common/softmax_layer_quantized.cl   | 97 +++++++++++-----------
 src/gpu/cl/kernels/ClSoftmaxKernel.cpp             | 28 ++-----
 2 files changed, 54 insertions(+), 71 deletions(-)

diff --git a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
index 4d5006d804..192c5f97a1 100644
--- a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
@@ -27,6 +27,7 @@
 
 #define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 #define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VECTOR_SIZE)
 
 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
  *
@@ -76,37 +77,31 @@ __kernel void softmax_layer_norm_quantized(
     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
 
+#ifdef BETA
+    // Initialize beta
+    VEC_FLOAT beta       = (VEC_FLOAT)BETA;
+    VEC_FLOAT scale_beta = -BETA * SCALE;
+#else /* BETA */
+    VEC_FLOAT scale_beta = -SCALE;
+#endif /* BETA */
+
     // Load max value of 1D logits vector (row)
-    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
-
-    // It will be better to calculate this in prev layer and pass here as parameter
-    uint    sum_val_u               = convert_uint(sum_val);
-    int     headroom_plus_one       = clz(sum_val_u);
-    int     num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
-    int     shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
-    VEC_INT shifted_sum_minus_one   = shifted_sum_minus_one_1;
-    VEC_INT shifted_scale           = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
-
-    // It was already calculated in prev layer, should be stored into tmp output and reused
-    VEC_INT data_diff      = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
-    VEC_INT data_diff_mult = data_diff;
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
-    if(INPUT_BETA_MULTIPLIER > 1)
-    {
-        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
-    }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    float sum_val         = *((__global float *)offset(&sum, 0, get_global_id(1)));
+    float sum_val_inverse = 256.f / sum_val;
+
+    VEC_INT   data_diff   = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
+    VEC_FLOAT data_diff_f = CONVERT(data_diff, VEC_FLOAT);
+
+    data_diff_f *= scale_beta;
+    data_diff_f = exp(data_diff_f);
+    data_diff_f *= sum_val_inverse;
 
-    VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
-    data         = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
-    data         = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
 #ifdef QASYMM8_SIGNED
-    data += (VEC_INT)(MIN_VALUE);
+    data_diff_f -= 128.f;
 #endif /* QASYMM8_SIGNED */
-    data           = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
+    VEC_INT  data  = CONVERT(data_diff_f, VEC_INT);
     VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
-
-    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
 
 #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
@@ -189,6 +184,14 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
 
     VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
 
+#ifdef BETA
+    // Initialize beta
+    VEC_FLOAT beta       = (VEC_FLOAT)BETA;
+    VEC_FLOAT scale_beta = -BETA * SCALE;
+#else /* BETA */
+    VEC_FLOAT scale_beta = -SCALE;
+#endif /* BETA */
+
     // Calculate max of row
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
     VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
@@ -210,39 +213,35 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
     // Second part
 
     // Load max value of 1D logits vector (row)
-    int max_val = convert_int(max_local);
-
-    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
-    VEC_INT sum1D = 0;
-
+    int       max_val = convert_int(max_local);
+    VEC_FLOAT sum1D_f = 0.f;
+
+    // Start with the leftover items
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
-    VEC_INT data_fp        = CONVERT(data, VEC_INT);
-    VEC_INT data_diff      = data_fp - max_val;
-    VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
-    data_fp                = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
-    data_fp                = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+    VEC_INT   data_fp   = CONVERT(data, VEC_INT);
+    VEC_INT   data_diff = max_val - data_fp;
+    VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT);
+    data_fp_f *= scale_beta;
+    data_fp_f = exp(data_fp_f);
+    data_fp_f = select(0, data_fp_f, widx);
     VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
     (data_diff, 0, (__global int *)dst_addr);
-    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
-    sum1D += select(0, data_fp, widx);
+    sum1D_f += data_fp_f;
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
-    // Shift values, exp and sum
+    // Do the rest and compute exp and sum
     for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
-        VEC_BASE data           = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
-        VEC_INT  data_fp        = CONVERT(data, VEC_INT);
-        VEC_INT  data_diff      = data_fp - max_val;
-        VEC_INT  data_diff_mult = mult_by_quantized_multiplier(data_diff);
-        data_fp                 = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
-        data_fp                 = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+        VEC_BASE  data      = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+        VEC_INT   data_fp   = CONVERT(data, VEC_INT);
+        VEC_INT   data_diff = max_val - data_fp;
+        VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT);
+        data_fp_f *= scale_beta;
+        data_fp_f = exp(data_fp_f);
+        sum1D_f += data_fp_f;
         VSTORE(VECTOR_SIZE)
         (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
-        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
     }
 
-    // Perform sum reduction
-    *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
+    *((__global float *)sum.ptr) = SUM_REDUCE(sum1D_f, VECTOR_SIZE);
 }
 
 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
index 4c00413469..467bbe46ce 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -177,8 +177,7 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
     const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
     const int  min_value         = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
 
-    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
-    const unsigned int    vector_size             = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size);
+    const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size);
 
     // Set build options
     CLBuildOptions build_opts;
@@ -193,29 +192,12 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
     build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
     build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
+    build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+    build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
 
     cl::NDRange lws_hint(cl::NullRange);
-    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "");
-
-    // Configure parallel kernel if needed
-    if(std::get<0>(parallel_reduction_info))
-    {
-        kernel_name += "parallel";
-        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
-        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
-
-        // Handle boundary conditions.
-        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
-        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
-        // Setting _lws_hint in this way can also communicate grid_size to ClLogits1DMaxShiftExpSumKernel::run().
-        // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
-        lws_hint = cl::NDRange(_grid_size);
-    }
-    else
-    {
-        kernel_name += "serial";
-    }
+    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial";
 
     // Create kernel.
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -313,6 +295,8 @@ void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, co
 
     build_opts.add_options_if(is_quantized_asymmetric, prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
     build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
+    build_opts.add_option_if(is_quantized_asymmetric, "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+    build_opts.add_option_if(is_quantized_asymmetric, "-DBETA=" + float_to_string_with_full_precision(info.beta));
 
     // Create kernel
     std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : "");
-- 
cgit v1.2.1
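Note (below the signature separator, not part of the patch): as a sanity check on the new arithmetic, here is a minimal scalar sketch in C++ of what the rewritten serial exp-sum and norm kernels compute for a single non-empty QASYMM8 (unsigned) row; in the QASYMM8_SIGNED case the norm kernel additionally subtracts 128.f before the final conversion. The function name and the std::vector interface are illustrative only. 'scale' and 'beta' correspond to the -DSCALE and -DBETA build options that ClSoftmaxKernel.cpp now passes to both kernels, and the 256.f factor reflects the fixed softmax output quantisation (scale 1/256, zero point 0).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar model of the two patched kernels for one row of QASYMM8 softmax.
std::vector<uint8_t> softmax_qasymm8_reference(const std::vector<uint8_t> &row,
                                               float scale, float beta)
{
    const float scale_beta = -beta * scale; // matches scale_beta in the kernels

    // Kernel 1 (max_shift_exp_sum): find the row max, store diff = max - x
    // as int, and accumulate the float sum of exp(diff * scale_beta).
    const int        max_val = *std::max_element(row.begin(), row.end());
    std::vector<int> diff(row.size());
    float            sum = 0.f;
    for(size_t i = 0; i < row.size(); ++i)
    {
        diff[i] = max_val - static_cast<int>(row[i]);
        sum += std::exp(static_cast<float>(diff[i]) * scale_beta);
    }

    // Kernel 2 (norm): renormalise each exponential with 256/sum and
    // requantise; CONVERT then CONVERT_SAT means truncate, then saturate.
    const float          sum_val_inverse = 256.f / sum;
    std::vector<uint8_t> out(row.size());
    for(size_t i = 0; i < row.size(); ++i)
    {
        const float q = std::exp(static_cast<float>(diff[i]) * scale_beta) * sum_val_inverse;
        out[i] = static_cast<uint8_t>(std::clamp(static_cast<int>(q), 0, 255));
    }
    return out;
}

Compared with the removed fixed-point path (ASYMM_EXP_ON_NEGATIVE_VALUES and friends), the intermediate tensor handed between the two kernels still holds max - x as int, but the exponential, the row sum (now written to the sum tensor as float) and the renormalisation all run in float.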