From 671d4f01d96b62a24cf0688059118a1e7908650e Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 14 Oct 2020 12:26:51 +0100 Subject: COMPMID-3724: Remove OpenCL padding: CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel COMPMID-3725: Remove OpenCL padding: CLGEMMLowpQuantizeDownInt32ScaleKernel Change-Id: Idea5974a56861efae3bc255f1224c7f1e88f3650 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4182 Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins --- src/core/CL/cl_kernels/gemmlowp.cl | 98 +++++++++++++++++++--------------- src/core/CL/cl_kernels/helpers_asymm.h | 32 ++++++++--- 2 files changed, 81 insertions(+), 49 deletions(-) (limited to 'src/core/CL/cl_kernels') diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 29314ec581..c962d3c20e 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -1896,6 +1896,7 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1925,7 +1926,7 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -1933,18 +1934,20 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) // Add the offset terms to GEMM's result - input_values += (int4)RESULT_OFFSET; + input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET; // Multiply by result_mult_int and shift input_values *= RESULT_MULT_INT; @@ -1955,18 +1958,18 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), input_values >>= RESULT_SHIFT; #endif // RESULT_SHIFT < 0 - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) @@ -1991,7 +1994,8 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2021,7 +2025,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0); + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -2029,38 +2033,40 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) // Multiply by result_mult_int and shift #if RESULT_SHIFT < 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #else // RESULT_SHIFT >= 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #endif // RESULT_SHIFT < 0 // Add the offset terms to GEMM's result - input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; + input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT; - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) @@ -2083,7 +2089,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2113,7 +2120,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0); + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -2121,34 +2128,37 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) // Multiply by result_mult_int and shift #if RESULT_SHIFT < 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #else // RESULT_SHIFT >= 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #endif // RESULT_SHIFT < 0 - short4 res0 = convert_short4_sat(input_values); + VEC_DATA_TYPE(short, VEC_SIZE) + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE)); #if defined(MIN_BOUND) - res0 = max(res0, (short4)MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res0 = min(res0, (short4)MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - STORE_VECTOR_SELECT(res, short, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) + STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) @@ -2173,6 +2183,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2208,7 +2220,7 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src #endif // defined(DST_HEIGHT) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -2216,13 +2228,15 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); input_values += (int4)biases_values; #endif // defined(ADD_BIAS) @@ -2230,17 +2244,17 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src float4 input_values_f = convert_float4(input_values); input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 70134af6ee..4a955ae3eb 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -123,8 +123,8 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) VEC_DATA_TYPE(int, size) \ mask = (one << exponent) - one; \ VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ + threshold = (mask >> 1) + select(zero, one, (SELECT_DATA_TYPE(int, size))(x < 0)); \ + return (x >> exponent) + select(zero, one, (SELECT_DATA_TYPE(int, size))((x & mask) > threshold)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -153,12 +153,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) VEC_DATA_TYPE(long, size) \ is_positive_or_zero = ab_64 >= 0; \ VEC_DATA_TYPE(long, size) \ - nudge = select(mask2, mask1, is_positive_or_zero); \ + nudge = select(mask2, mask1, (SELECT_DATA_TYPE(long, size))(is_positive_or_zero)); \ VEC_DATA_TYPE(long, size) \ mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ - return select(ab_x2_high32, INT_MAX, overflow); \ + return select(ab_x2_high32, INT_MAX, (SELECT_DATA_TYPE(int, size))(overflow)); \ } /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). @@ -216,7 +216,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) { \ const VEC_DATA_TYPE(int, size) all_zeros = 0; \ const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a == 0); \ + return select(all_zeros, all_ones, (SELECT_DATA_TYPE(int, size))(a == 0)); \ } /** For each element of input vector, the corresponding bits of the result item are set @@ -231,7 +231,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) { \ const VEC_DATA_TYPE(int, size) all_zeros = 0; \ const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a != 0); \ + return select(all_zeros, all_ones, (SELECT_DATA_TYPE(int, size))(a != 0)); \ } #define EXP_BARREL_SHIFTER_IMPL(size) \ @@ -338,7 +338,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) const VEC_DATA_TYPE(long, size) one = 1; \ const VEC_DATA_TYPE(long, size) minus_one = -1; \ VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ + sign = select(minus_one, one, (SELECT_DATA_TYPE(long, size))(sum >= 0)); \ return convert_int##size((sum + sign) / 2); \ } @@ -446,73 +446,91 @@ DEQUANTIZE_IMPL(int, 16) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) ASYMM_MULT_IMPL(1) ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(3) ASYMM_MULT_IMPL(4) ASYMM_MULT_IMPL(8) ASYMM_MULT_IMPL(16) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) ASYMM_SELECT_USING_MASK_IMPL(1) ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(3) ASYMM_SELECT_USING_MASK_IMPL(4) ASYMM_SELECT_USING_MASK_IMPL(8) ASYMM_SELECT_USING_MASK_IMPL(16) ASYMM_MASK_IF_ZERO_IMPL(1) ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(3) ASYMM_MASK_IF_ZERO_IMPL(4) ASYMM_MASK_IF_ZERO_IMPL(8) ASYMM_MASK_IF_ZERO_IMPL(16) ASYMM_MASK_IF_NON_ZERO_IMPL(1) ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(3) ASYMM_MASK_IF_NON_ZERO_IMPL(4) ASYMM_MASK_IF_NON_ZERO_IMPL(8) ASYMM_MASK_IF_NON_ZERO_IMPL(16) +EXP_BARREL_SHIFTER_IMPL(1) EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(3) EXP_BARREL_SHIFTER_IMPL(4) EXP_BARREL_SHIFTER_IMPL(8) EXP_BARREL_SHIFTER_IMPL(16) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) +ASYMM_ROUNDING_HALF_SUM_IMPL(1) ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(3) ASYMM_ROUNDING_HALF_SUM_IMPL(4) ASYMM_ROUNDING_HALF_SUM_IMPL(8) ASYMM_ROUNDING_HALF_SUM_IMPL(16) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) ASYMM_RESCALE_IMPL(1) ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(3) ASYMM_RESCALE_IMPL(4) ASYMM_RESCALE_IMPL(8) ASYMM_RESCALE_IMPL(16) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) -- cgit v1.2.1