aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2020-10-14 12:26:51 +0100
committerMichele Di Giorgio <michele.digiorgio@arm.com>2020-10-16 17:19:40 +0000
commit671d4f01d96b62a24cf0688059118a1e7908650e (patch)
tree33ee626be7de34f0c7fb91da9cb136004c361cb7
parent3b9a564fd4573d7cf09e3203eb8a9a30fd5969c9 (diff)
downloadComputeLibrary-671d4f01d96b62a24cf0688059118a1e7908650e.tar.gz
COMPMID-3724: Remove OpenCL padding: CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
COMPMID-3725: Remove OpenCL padding: CLGEMMLowpQuantizeDownInt32ScaleKernel Change-Id: Idea5974a56861efae3bc255f1224c7f1e88f3650 Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4182 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h9
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl98
-rw-r--r--src/core/CL/cl_kernels/helpers_asymm.h32
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp1
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp45
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp55
-rw-r--r--tests/validation/CL/GEMMLowp.cpp20
7 files changed, 128 insertions, 132 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
index 767d7927b4..1a284f0701 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -42,7 +42,7 @@ class ICLTensor;
* -# Clamp the value between the specified min and max bounds
* -# Clamp the resulting int32 values:
* -# -to the [0..255] range and cast to QASYMM8.
- * -# -to the [-128..127] range and cast to QASYMM8/SIGNED.
+ * -# -to the [-128..127] range and cast to QASYMM8_SIGNED.
*
*/
class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
@@ -93,10 +93,9 @@ public:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
- const GEMMLowpOutputStageInfo *_output_stage;
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
};
} // namespace arm_compute
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 29314ec581..c962d3c20e 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1896,6 +1896,7 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC
* @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
* These values can be used to implement "rectified linear unit" activation functions
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -1925,7 +1926,7 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst))
{
// Compute source and destination addresses
- int x = get_global_id(0) * 4;
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
int y = get_global_id(1);
int z = get_global_id(2);
@@ -1933,18 +1934,20 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
- int4 input_values = vload4(0, (__global int *)src_addr);
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
#if defined(ADD_BIAS)
// Add bias
__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
- int4 biases_values = vload4(0, (__global int *)bias_addr);
- input_values += (int4)biases_values;
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
#endif // defined(ADD_BIAS)
// Add the offset terms to GEMM's result
- input_values += (int4)RESULT_OFFSET;
+ input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
// Multiply by result_mult_int and shift
input_values *= RESULT_MULT_INT;
@@ -1955,18 +1958,18 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
input_values >>= RESULT_SHIFT;
#endif // RESULT_SHIFT < 0
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
- res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
#if defined(MIN_BOUND)
- res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
@@ -1991,7 +1994,8 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
* @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
* These values can be used to implement "rectified linear unit" activation functions
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -2021,7 +2025,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
TENSOR3D_DECLARATION(dst))
{
// Compute source and destination addresses
- int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0);
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
int y = get_global_id(1);
int z = get_global_id(2);
@@ -2029,38 +2033,40 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
- int4 input_values = vload4(0, (__global int *)src_addr);
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
#if defined(ADD_BIAS)
// Add bias
__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
- int4 biases_values = vload4(0, (__global int *)bias_addr);
- input_values += (int4)biases_values;
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
#endif // defined(ADD_BIAS)
// Multiply by result_mult_int and shift
#if RESULT_SHIFT < 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else // RESULT_SHIFT >= 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT < 0
// Add the offset terms to GEMM's result
- input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+ input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
- res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
#if defined(MIN_BOUND)
- res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
@@ -2083,7 +2089,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
* These values can be used to implement "rectified linear unit" activation functions
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -2113,7 +2120,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
TENSOR3D_DECLARATION(dst))
{
// Compute source and destination addresses
- int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0);
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
int y = get_global_id(1);
int z = get_global_id(2);
@@ -2121,34 +2128,37 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
- int4 input_values = vload4(0, (__global int *)src_addr);
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
#if defined(ADD_BIAS)
// Add bias
__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
- int4 biases_values = vload4(0, (__global int *)bias_addr);
- input_values += (int4)biases_values;
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+ input_values += biases_values;
#endif // defined(ADD_BIAS)
// Multiply by result_mult_int and shift
#if RESULT_SHIFT < 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else // RESULT_SHIFT >= 0
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT < 0
- short4 res0 = convert_short4_sat(input_values);
+ VEC_DATA_TYPE(short, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
#if defined(MIN_BOUND)
- res0 = max(res0, (short4)MIN_BOUND);
+ res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res0 = min(res0, (short4)MAX_BOUND);
+ res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- STORE_VECTOR_SELECT(res, short, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+ STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
@@ -2173,6 +2183,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
* @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
* These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
*
* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -2208,7 +2220,7 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
#endif // defined(DST_HEIGHT)
{
// Compute source and destination addresses
- int x = get_global_id(0) * 4;
+ int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
int y = get_global_id(1);
int z = get_global_id(2);
@@ -2216,13 +2228,15 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
- int4 input_values = vload4(0, (__global int *)src_addr);
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
#if defined(ADD_BIAS)
// Add bias
__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
- int4 biases_values = vload4(0, (__global int *)bias_addr);
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
input_values += (int4)biases_values;
#endif // defined(ADD_BIAS)
@@ -2230,17 +2244,17 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
float4 input_values_f = convert_float4(input_values);
input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
- VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
- res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+ res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
#if defined(MIN_BOUND)
- res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+ res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+ res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+ STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 70134af6ee..4a955ae3eb 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -123,8 +123,8 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
VEC_DATA_TYPE(int, size) \
mask = (one << exponent) - one; \
VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, x < 0); \
- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ threshold = (mask >> 1) + select(zero, one, (SELECT_DATA_TYPE(int, size))(x < 0)); \
+ return (x >> exponent) + select(zero, one, (SELECT_DATA_TYPE(int, size))((x & mask) > threshold)); \
}
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -153,12 +153,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
VEC_DATA_TYPE(long, size) \
is_positive_or_zero = ab_64 >= 0; \
VEC_DATA_TYPE(long, size) \
- nudge = select(mask2, mask1, is_positive_or_zero); \
+ nudge = select(mask2, mask1, (SELECT_DATA_TYPE(long, size))(is_positive_or_zero)); \
VEC_DATA_TYPE(long, size) \
mask = 1ll << 31; \
VEC_DATA_TYPE(int, size) \
ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
- return select(ab_x2_high32, INT_MAX, overflow); \
+ return select(ab_x2_high32, INT_MAX, (SELECT_DATA_TYPE(int, size))(overflow)); \
}
/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
@@ -216,7 +216,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{ \
const VEC_DATA_TYPE(int, size) all_zeros = 0; \
const VEC_DATA_TYPE(int, size) all_ones = ~0; \
- return select(all_zeros, all_ones, a == 0); \
+ return select(all_zeros, all_ones, (SELECT_DATA_TYPE(int, size))(a == 0)); \
}
/** For each element of input vector, the corresponding bits of the result item are set
@@ -231,7 +231,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{ \
const VEC_DATA_TYPE(int, size) all_zeros = 0; \
const VEC_DATA_TYPE(int, size) all_ones = ~0; \
- return select(all_zeros, all_ones, a != 0); \
+ return select(all_zeros, all_ones, (SELECT_DATA_TYPE(int, size))(a != 0)); \
}
#define EXP_BARREL_SHIFTER_IMPL(size) \
@@ -338,7 +338,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
const VEC_DATA_TYPE(long, size) one = 1; \
const VEC_DATA_TYPE(long, size) minus_one = -1; \
VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, sum >= 0); \
+ sign = select(minus_one, one, (SELECT_DATA_TYPE(long, size))(sum >= 0)); \
return convert_int##size((sum + sign) / 2); \
}
@@ -446,73 +446,91 @@ DEQUANTIZE_IMPL(int, 16)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)
ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)
ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+ASYMM_ROUNDING_HALF_SUM_IMPL(1)
ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(3)
ASYMM_ROUNDING_HALF_SUM_IMPL(4)
ASYMM_ROUNDING_HALF_SUM_IMPL(8)
ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(3)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
index ff4136c5f0..eae66413a6 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -97,6 +97,7 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi
auto min = info->gemmlowp_min_bound;
auto max = info->gemmlowp_max_bound;
CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index 242d151272..430a84cfa0 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
@@ -65,38 +64,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type));
-
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win,
- input_access);
-
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_result_access);
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
class Coordinates;
@@ -127,15 +94,22 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info));
+ // Output auto inizialitation if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type));
+
_input = input;
_bias = bias;
_output = output;
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
+
auto min = info->gemmlowp_min_bound;
auto max = info->gemmlowp_max_bound;
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -147,9 +121,8 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon
_kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info->output_data_type);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
}
void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 55e4ed2bd9..79888cdba2 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
@@ -62,41 +61,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type)
-{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type));
-
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win,
- input_access);
-
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_result_access);
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} //namespace
CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr)
+ : _input(nullptr), _bias(nullptr), _output(nullptr)
{
}
+
Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -110,7 +81,8 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, c
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage);
}
-void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage)
+void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ const GEMMLowpOutputStageInfo *output_stage)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -120,15 +92,21 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c
output->info(),
output_stage));
- _input = input;
- _bias = bias;
- _output = output;
- _output_stage = output_stage;
+ // Output auto inizialitation if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type));
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0));
// Set the arguments to pass at compile time
auto min = output_stage->gemmlowp_min_bound;
auto max = output_stage->gemmlowp_max_bound;
CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
@@ -143,9 +121,8 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c
_kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options());
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), output_stage->output_data_type);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
}
void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 8d5ac24de8..00f831b2e2 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -48,7 +48,7 @@ namespace
{
constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
-bool validate_output_stage_zero_padding(const TensorShape shape, const DataType dt)
+bool validate_output_stage_zero_padding(const TensorShape shape, const DataType dt, const GEMMLowpOutputStageType type)
{
// Create tensors
CLTensor src = create_tensor<CLTensor>(shape, DataType::S32, 1);
@@ -56,7 +56,7 @@ bool validate_output_stage_zero_padding(const TensorShape shape, const DataType
CLTensor dst = create_tensor<CLTensor>(shape, dt, 1);
GEMMLowpOutputStageInfo info;
- info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.type = type;
info.output_data_type = dt;
std::tie(info.gemmlowp_min_bound, info.gemmlowp_max_bound) = quantization::get_min_max_values_from_quantized_data_type(dt);
@@ -147,6 +147,13 @@ TEST_SUITE(OutputStage)
TEST_SUITE(QuantizeDownInt32Scale)
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::QASYMM8_SIGNED })),
+ shape, data_type)
+{
+ bool status = validate_output_stage_zero_padding(shape, data_type, GEMMLowpOutputStageType::QUANTIZE_DOWN);
+ ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
TEST_SUITE(QASYMM8)
const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
@@ -208,7 +215,7 @@ TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint)
DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16 })),
shape, data_type)
{
- bool status = validate_output_stage_zero_padding(shape, data_type);
+ bool status = validate_output_stage_zero_padding(shape, data_type, GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}
@@ -346,6 +353,13 @@ TEST_SUITE_END() // QuantizeDownInt32ScaleByFixedPoint
TEST_SUITE(QuantizeDownInt32ScaleByFloat)
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::QASYMM8_SIGNED })),
+ shape, data_type)
+{
+ bool status = validate_output_stage_zero_padding(shape, data_type, GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT);
+ ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
TEST_SUITE(QASYMM8)
using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture =
GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture<CLTensor, CLAccessor, CLGEMMLowpOutputStage, uint8_t>;