author     Giorgio Arena <giorgio.arena@arm.com>  2020-10-26 15:04:08 +0000
committer  Giorgio Arena <giorgio.arena@arm.com>  2020-11-12 12:42:51 +0000
commit     2d1a835b68eb27a800838fc2b563b12eddf2c19f (patch)
tree       228dee073d37d2ec5b5dfbdb3d0e1e512ecb2d22
parent     00c7601b1f9c3bec1d3b1db844abb513b9012541 (diff)
download   ComputeLibrary-2d1a835b68eb27a800838fc2b563b12eddf2c19f.tar.gz
COMPMID-3735 Remove OpenCL padding: CLSoftmaxLayerKernel
- Renamed SELECT_DATA_TYPE to SELECT_VEC_DATA_TYPE to reflect its usage with vectors; SELECT_DATA_TYPE(dt) will now return the primitive data type (see the usage sketch after this message)
- Changed the interface of VEC_OFFS and V_OFFS in order to receive the primitive data type as a parameter rather than its vector form
- Performed a general cleanup of the kernels, such as creating macros for the sum and max reductions, and removing redundant macros, defines, variables, calculations, etc.
- Used VEC_SIZE and VEC_SIZE_LEFTOVER in every kernel in order to allow computation for smaller shapes without adding padding
- Removed the actual padding from the kernel and adjusted its calculations accordingly. Added asserts for the padding removal checks. Removed invalid Validate tests.

Change-Id: If5ccbd5d34e255d38c7f6bfe8740e2b80b28e264
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4277
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
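The renamed select helpers and the new VEC_OFFS/V_OFFS interface come together when a kernel builds the leftover mask that replaces right-hand padding. A minimal sketch of the intended usage (illustrative fragment only, not taken verbatim from the patch; it assumes a kernel compiled with -DDATA_TYPE, -DVEC_SIZE and -DVEC_SIZE_LEFTOVER, and src_addr is a hypothetical pointer to the row being processed):

    // SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) is the vector comparison type expected by select(),
    // while SELECT_DATA_TYPE(DATA_TYPE) is its primitive counterpart, which VEC_OFFS now accepts.
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
    // Lane i is valid only when i < VEC_SIZE_LEFTOVER; invalid lanes are zeroed out instead of
    // relying on padded memory being readable past the end of the row.
    SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    widx = (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VEC_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE);
    data = select((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, data, widx);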
-rw-r--r--  src/core/CL/cl_kernels/activation_float_helpers.h            2
-rw-r--r--  src/core/CL/cl_kernels/depthwise_convolution.cl             24
-rw-r--r--  src/core/CL/cl_kernels/elementwise_operation.cl              2
-rw-r--r--  src/core/CL/cl_kernels/elementwise_operation_quantized.cl    2
-rw-r--r--  src/core/CL/cl_kernels/helpers.h                            62
-rw-r--r--  src/core/CL/cl_kernels/helpers_asymm.h                      29
-rw-r--r--  src/core/CL/cl_kernels/pad_layer.cl                          4
-rw-r--r--  src/core/CL/cl_kernels/pooling_layer.cl                      6
-rw-r--r--  src/core/CL/cl_kernels/pooling_layer_quantized.cl           37
-rw-r--r--  src/core/CL/cl_kernels/select.cl                            19
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer.cl                    415
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer_quantized.cl          493
-rw-r--r--  src/core/CL/cl_kernels/winograd_input_transform.cl          92
-rw-r--r--  src/core/CL/cl_kernels/yolo_layer.cl                         2
-rw-r--r--  src/core/CL/kernels/CLSoftmaxLayerKernel.cpp               136
-rw-r--r--  tests/validation/CL/SoftmaxLayer.cpp                        10
16 files changed, 570 insertions, 765 deletions
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index 8bd6aad42e..91d7197889 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -55,7 +55,7 @@
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
// ELU Activation
-#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
+#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
// Absolute Activation
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 5aba2061b4..81fa01ae99 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -1476,17 +1476,17 @@ __kernel void dwc_MxN_native_fp_nhwc(
#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
- ({ \
- basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s0)); \
- basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s1)); \
- basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s2)); \
+#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
+ ({ \
+ basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \
+ basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \
+ basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \
})
-#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
- ({ \
- FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
- basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s3)); \
+#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
+ ({ \
+ FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
+ basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \
})
#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
@@ -1728,8 +1728,8 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1(
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
#endif /* defined(DST_DEPTH) */
- int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int4);
- int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int4);
+ int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int);
+ int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int);
int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
@@ -1844,7 +1844,7 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1(
acc3 += bias_values;
#endif // defined(HAS_BIAS)
- int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int2), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
+ int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
int dst_coord_z = z * NUM_PLANES_PROCESSED;
#if defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl
index 3519ef8ea7..f6c09b4ec7 100644
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ b/src/core/CL/cl_kernels/elementwise_operation.cl
@@ -38,7 +38,7 @@
#define SQUARED_DIFF(x, y) (x - y) * (x - y)
#define DIV(x, y) (x / y)
#define POWER(x, y) pow(x, y)
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))))
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))))
#define OP_FUN_NAME_STR(op) elementwise_operation_##op
#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
index 0c512b4b4d..a08c3b2d47 100644
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
@@ -28,7 +28,7 @@
#define MAX(x, y) max((x), (y))
#define MIN(x, y) min((x), (y))
#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_DATA_TYPE(float, VEC_SIZE_OUT))))
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
#define DIV(x, y) (x / y)
#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 1f637ade2f..372ccd91fb 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -172,12 +172,12 @@
* @return The vector filled with offset values
* @{
*/
-#define V_OFFS1(dt) (dt)(0)
-#define V_OFFS2(dt) (dt)(0, 1)
-#define V_OFFS3(dt) (dt)(0, 1, 2)
-#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
-#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define V_OFFS1(dt) (dt##1)(0)
+#define V_OFFS2(dt) (dt##2)(0, 1)
+#define V_OFFS3(dt) (dt##3)(0, 1, 2)
+#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn
/** Create a vector filled with offset values corresponding to the location of each element.
@@ -507,9 +507,6 @@
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-#define CL_VEC_DATA_TYPE_STR(type, size) type##size
-#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
-
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)
@@ -519,19 +516,40 @@
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
-#define select_dt_uchar(size) uchar##size
-#define select_dt_char(size) char##size
-#define select_dt_ushort(size) ushort##size
-#define select_dt_short(size) short##size
-#define select_dt_half(size) short##size
-#define select_dt_uint(size) uint##size
-#define select_dt_int(size) int##size
-#define select_dt_float(size) int##size
-#define select_dt_ulong(size) ulong##size
-#define select_dt_long(size) long##size
-
-#define SELECT_DATA_TYPE_STR(type, size) select_dt_##type(size)
-#define SELECT_DATA_TYPE(type, size) SELECT_DATA_TYPE_STR(type, size)
+#define select_vec_dt_uchar(size) uchar##size
+#define select_vec_dt_char(size) char##size
+#define select_vec_dt_ushort(size) ushort##size
+#define select_vec_dt_short(size) short##size
+#define select_vec_dt_half(size) short##size
+#define select_vec_dt_uint(size) uint##size
+#define select_vec_dt_int(size) int##size
+#define select_vec_dt_float(size) int##size
+#define select_vec_dt_ulong(size) ulong##size
+#define select_vec_dt_long(size) long##size
+
+#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
+#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
+#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
+
+#define sum_reduce_1(x) (x)
+#define sum_reduce_2(x) ((x).s0) + ((x).s1)
+#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
+#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
+#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
+#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
+
+#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
+#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
+
+#define max_reduce_1(x) (x)
+#define max_reduce_2(x) max(((x).s0), ((x).s1))
+#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
+#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
+#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
+#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
+
+#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
+#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
#define VECTOR_DECLARATION(name) \
__global uchar *name##_ptr, \
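The new SUM_REDUCE and MAX_REDUCE helpers added above collapse a vector into a scalar through pairwise halving; the softmax kernels further down use them in place of the hand-unrolled .s01234567/.s89ABCDEF reductions. A small usage sketch (hypothetical values, not part of the patch; the size argument must resolve to 1, 2, 3, 4, 8 or 16 after preprocessing, which is what the *_STR indirection guarantees when a macro such as VEC_SIZE is passed):

    float4 acc     = (float4)(1.0f, 7.0f, 3.0f, 5.0f);  // example accumulator
    float  row_max = MAX_REDUCE(acc, 4);                 // expands to max(max(acc.s0, acc.s1), max(acc.s2, acc.s3)) = 7.0f
    float  row_sum = SUM_REDUCE(acc, 4);                 // expands to (acc.s0 + acc.s1) + (acc.s2 + acc.s3) = 16.0f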
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 4a955ae3eb..59c8fa606d 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -123,8 +123,8 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
VEC_DATA_TYPE(int, size) \
mask = (one << exponent) - one; \
VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, (SELECT_DATA_TYPE(int, size))(x < 0)); \
- return (x >> exponent) + select(zero, one, (SELECT_DATA_TYPE(int, size))((x & mask) > threshold)); \
+ threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
+ return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
}
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -153,12 +153,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
VEC_DATA_TYPE(long, size) \
is_positive_or_zero = ab_64 >= 0; \
VEC_DATA_TYPE(long, size) \
- nudge = select(mask2, mask1, (SELECT_DATA_TYPE(long, size))(is_positive_or_zero)); \
+ nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \
VEC_DATA_TYPE(long, size) \
mask = 1ll << 31; \
VEC_DATA_TYPE(int, size) \
ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
- return select(ab_x2_high32, INT_MAX, (SELECT_DATA_TYPE(int, size))(overflow)); \
+ return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \
}
/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
@@ -216,7 +216,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{ \
const VEC_DATA_TYPE(int, size) all_zeros = 0; \
const VEC_DATA_TYPE(int, size) all_ones = ~0; \
- return select(all_zeros, all_ones, (SELECT_DATA_TYPE(int, size))(a == 0)); \
+ return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \
}
/** For each element of input vector, the corresponding bits of the result item are set
@@ -231,7 +231,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{ \
const VEC_DATA_TYPE(int, size) all_zeros = 0; \
const VEC_DATA_TYPE(int, size) all_ones = ~0; \
- return select(all_zeros, all_ones, (SELECT_DATA_TYPE(int, size))(a != 0)); \
+ return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
}
#define EXP_BARREL_SHIFTER_IMPL(size) \
@@ -338,7 +338,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
const VEC_DATA_TYPE(long, size) one = 1; \
const VEC_DATA_TYPE(long, size) minus_one = -1; \
VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, (SELECT_DATA_TYPE(long, size))(sum >= 0)); \
+ sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
return convert_int##size((sum + sign) / 2); \
}
@@ -390,8 +390,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
+#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
@@ -401,11 +403,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
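The *_STR wrappers introduced above for ASYMM_ROUNDING_DIVIDE_BY_POW2, ASYMM_MULT, ASYMM_EXP_ON_NEGATIVE_VALUES, ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 and ASYMM_RESCALE are the usual two-level token-pasting idiom: they let callers pass a macro (such as VECTOR_SIZE) as the size argument. A short illustration (hypothetical values, standard C preprocessor behaviour rather than code from the patch):

    // With the single-level definition, the argument next to ## is not expanded first, so
    // ASYMM_MULT(a, b, VECTOR_SIZE) would paste the literal token and call asymm_multVECTOR_SIZE(a, b).
    // With the two-level definition the argument is expanded before pasting:
    //   #define VECTOR_SIZE 16
    //   ASYMM_MULT(a, b, VECTOR_SIZE) -> ASYMM_MULT_STR(a, b, 16) -> asymm_mult16(a, b)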
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl
index d2b43aac2b..fe71b5d119 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/pad_layer.cl
@@ -27,8 +27,8 @@
#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_SELECT SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE)
#if defined(CONST_VAL)
/** Perform a pad operation when PaddingMode is CONSTANT
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 680e947149..00250a08a5 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -786,7 +786,7 @@ __kernel void pooling_layer_MxN_nhwc(
}
#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-#define SELECT_TYPE SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
* -# max, -DPOOL_MAX must be passed at compile time
@@ -957,7 +957,7 @@ __kernel void pooling_layer_2x2_nhwc(
// note: Batch dimension does not contribute in the offset contribution
VEC_DATA_TYPE(uint, VEC_SIZE) base_index = (uint)idx_out_c;
- base_index += VEC_OFFS(VEC_DATA_TYPE(uint, VEC_SIZE), VEC_SIZE);
+ base_index += VEC_OFFS(uint, VEC_SIZE);
VEC_DATA_TYPE(uint, VEC_SIZE) index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
VEC_DATA_TYPE(uint, VEC_SIZE) index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
@@ -978,4 +978,4 @@ __kernel void pooling_layer_2x2_nhwc(
STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(SELECT_DATA_TYPE) && defined(ACC_DATA_TYPE)
\ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 04fef98cd0..d8cef2b4e6 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -197,29 +197,23 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
{
// Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
// Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
- int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+ int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
int idx_out_w = get_global_id(1);
#if DST_BATCH_SIZE != 1
// If batch size != 1, the batch size dimension is collapsed over the height dimension
int idx_out_h = get_global_id(2) % DST_HEIGHT;
int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else //DST_BATCH_SIZE != 1
- int idx_out_h = get_global_id(2);
- int idx_out_n = 0;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
#endif // DST_BATCH_SIZE != 1
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
- offset_c +
- idx_out_n * input_stride_w;
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
- offset_c +
- idx_out_w * output_stride_y +
- idx_out_h * output_stride_z +
- idx_out_n * output_stride_w;
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
int pool_x_s = max((int)0, -idx_in_w);
int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
@@ -230,7 +224,7 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
int filter_size = 0;
#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
res0 = INITIAL_VALUE;
@@ -239,10 +233,12 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
{
for(int x = pool_x_s; x < pool_x_e; ++x)
{
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data;
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
res0 = POOL_OP(res0, data0);
@@ -257,7 +253,8 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
#endif // defined(POOL_AVG)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
@@ -265,5 +262,5 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
// Store result
STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(SELECT_DATA_TYPE) && defined(ACC_DATA_TYPE)
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
\ No newline at end of file
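The leftover handling used by the NHWC pooling kernels above (and by softmax_layer_norm below) hinges on the index expression max(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0): the first work-item processes the partial vector and every later work-item shifts back so that it reads a full, in-bounds vector. A worked example with hypothetical sizes (19 channels, -DVEC_SIZE=8, hence VEC_SIZE_LEFTOVER = 3):

    // (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE = (8 - 3) % 8 = 5
    // gid 0: max(0*8 - 5, 0) = 0   -> loads channels 0..7,  stores only 0..2 (leftover branch of STORE_VECTOR_SELECT)
    // gid 1: max(1*8 - 5, 0) = 3   -> loads/stores channels 3..10 as a full vector
    // gid 2: max(2*8 - 5, 0) = 11  -> loads/stores channels 11..18 as a full vector
    // Every channel is written exactly once and no access falls outside the unpadded tensor.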
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl
index 4752cc132f..b06a1118a8 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/select.cl
@@ -75,8 +75,8 @@ __kernel void select_same_rank(
Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
- SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -84,7 +84,7 @@ __kernel void select_same_rank(
// Calculate and store result
VSTORE(VEC_SIZE)
- (select(in_y, in_x, in_c > (SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
+ (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
}
/** This function perform a select operation between two tensors when condition tensor has a different rank.
@@ -136,7 +136,7 @@ __kernel void select_different_rank_2(
Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
- SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
@@ -145,15 +145,14 @@ __kernel void select_different_rank_2(
// Calculate and store result
VSTORE(VEC_SIZE)
- (select(in_y, in_x, in_c > (SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
+ (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
}
-#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) */
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
-#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE)
/** This function perform a select operation between two tensors when condition tensor has a different rank.
*
* @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
- * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
@@ -200,7 +199,7 @@ __kernel void select_different_rank_n(
Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
- SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
@@ -209,6 +208,6 @@ __kernel void select_different_rank_n(
// Calculate and store result
VSTORE(VEC_SIZE)
- (select(in_y, in_x, in_c > (SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
+ (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr);
}
#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */
\ No newline at end of file
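Since the mask type is now derived in-kernel with SELECT_VEC_DATA_TYPE, the select kernels no longer need -DSELECT_DATA_TYPE from the host, as the removed @attention line and guard changes above show. A hedged example of the remaining build options (values are illustrative only):

    // e.g. for select_different_rank_n after this change:
    //   -DDATA_TYPE=float -DVEC_SIZE=16 -DDEPTH_SIZE=4
    // Previously the host also had to supply -DSELECT_DATA_TYPE=... for these kernels.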
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 77dbb47e41..01f5de47cf 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -23,55 +23,15 @@
*/
#include "helpers.h"
-#define MAX_OP(x, y, type, size) max((x), (y))
-#define ADD_OP(x, y, type, size) ((x) + (y))
-#define SUB_OP(x, y, type, size) ((x) - (y))
-#define MUL_OP(x, y, type, size) ((x) * (y))
-#define DIV_OP(x, y, type, size) ((x) / (y))
-#define EXP_OP(x, type, size) exp((x))
-
-#ifdef USE_F16
-#define MINVAL -HALF_MAX
-#define SELECT_DATA_TYPE short
-#else /* USE_F16 */
-#define MINVAL -FLT_MAX
-#define SELECT_DATA_TYPE int
-#endif /* USE_F16 */
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-/* Vector size, i.e. number of vector elements. */
-#if VECTOR_SIZE == 2
-__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
-__constant uint2 idx__ = (uint2)(0, 1);
-
-#elif VECTOR_SIZE == 4
-__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
-__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
-
-#elif VECTOR_SIZE == 8
-__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
-__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
-
-#else /* VECTOR_SIZE DEFAULT */
-#define VECTOR_SIZE 16
-#define LOG_VECTOR_SIZE 4
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
-__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-#endif /* VECTOR_SIZE END */
-
-// TODO (COMPMID-661): Remove if the non-fused kernels are removed
-__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
-__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
+#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -103,28 +63,49 @@ __kernel void softmax_layer_norm(
TENSOR3D_DECLARATION(sum),
TENSOR3D_DECLARATION(dst))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
// Load max value of 1D logits vector (row)
DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
-#ifdef LOG_SOFTMAX
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(LOG_SOFTMAX)
sum_val = log(sum_val);
- vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#else /* LOG_SOFTMAX */
- vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#endif /* LOG_SOFTMAX */
+ data0 -= sum_val;
+#else // defined(LOG_SOFTMAX)
+ data0 /= sum_val;
+#endif // defined(LOG_SOFTMAX)
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
* then gets the exponent of each element as sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -158,136 +139,102 @@ __kernel void softmax_layer_norm(
* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
*/
__kernel void softmax_layer_max_shift_exp_sum_serial(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(maxo),
TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
#ifdef BETA
// Initialize beta
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+ VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */
// Initialize local maximum
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
-
- // Calculate max of row
- const uint width_ = width >> LOG_VECTOR_SIZE;
- for(uint i = 0; i < width_; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
- }
+ VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
- VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ {
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+ max_val_vec = max(data, max_val_vec);
+ }
+
// Perform max reduction
-#if VECTOR_SIZE == 16
- max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
- max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
- max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
- // Store result
- *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+ DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
+ *((__global DATA_TYPE *)maxo.ptr) = max_val;
/* Second section */
- // Load max value of 1D logits vector (row)
- DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
-
// Set sum vector
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- sum1D = 0;
+ VEC_TYPE sum1D = 0;
- // Shift values, exp and sum
- for(uint i = 0; i < width_; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)dst_addr);
+ data = exp(data);
+ data = select(0, data, widx);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+ data = exp(data);
+ data = select(0, data, widx);
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)dst_addr);
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
- }
+ sum1D += data;
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+ // Shift values, exp and sum
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ {
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- data = select(0, data, widx);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
+ data = exp(data);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- data = select(0, data, widx);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ sum1D += data;
+ }
// Perform sum reduction
-#if VECTOR_SIZE == 16
- sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
- sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
- sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
-
- // Calculate and store result
- *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+ *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
* then gets the exponent of each element as sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -321,71 +268,59 @@ __kernel void softmax_layer_max_shift_exp_sum_serial(
* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
*/
__kernel void softmax_layer_max_shift_exp_sum_parallel(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(maxo),
TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ const uint lid = get_local_id(0);
+ const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
- const uint lid = get_local_id(0);
-
#ifdef BETA
// Initialize beta
- VEC_DATA_TYPE(DATA_TYPE, 4)
- beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+ VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */
// Define one temporary vector per work-item.
- __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+ __local VEC_TYPE tmp_local[GRID_SIZE];
__local DATA_TYPE max_local;
- __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
- // Number of elements per work-item.
- const uint row = width / GRID_SIZE;
+ VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
+
// Number of iterations per work-item.
- const uint width_ = row >> 2;
+ const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
// Calculate max of row
uint i = 0;
- for(; i < width_; i++)
+ for(; i < width; ++i)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+ VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ max_val_vec = max(data_max, max_val_vec);
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
// How many work-items needed to complete the computation.
//TODO: Optimize this calculation (avoid %).
- int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
if(lid < boundary_workitems)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+ VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ max_val_vec = max(data_max, max_val_vec);
}
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ SELECT_TYPE widx;
+ if(lid == 0)
{
// Handle non multiple of 4
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
- max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
+ VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
}
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -397,7 +332,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 128)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -405,7 +340,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 64)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -413,7 +348,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 32)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -421,7 +356,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 16)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -429,7 +364,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 8)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -437,7 +372,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 4)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -445,99 +380,84 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 2)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
- max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
- max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
- max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
- max_local = max_val_vec.s0;
+ max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
+ max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Second section */
// Set sum vector
- VEC_DATA_TYPE(DATA_TYPE, 4)
- sum1D = 0;
+ VEC_TYPE sum1D = 0;
DATA_TYPE max_val = max_local;
// Shift values, exp and sum
- for(i = 0; i < width_; i++)
+ for(i = 0; i < width; ++i)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data = exp(data);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
//TODO: Optimize the calculation (avoid %).
- boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
if(lid < boundary_workitems)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data = exp(data);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
}
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ if(lid == 0)
{
// Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ data -= max_val;
#ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ data = exp(data);
data = select(0, data, widx);
#else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+ data = exp(data);
data = select(0, data, widx);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
}
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -549,7 +469,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 128)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -557,7 +477,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 64)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -565,7 +485,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 32)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -573,7 +493,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 16)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -581,7 +501,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 8)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -589,7 +509,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 4)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -597,16 +517,17 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
{
if(lid < 2)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
- sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
- // Perform max reduction
- sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
- sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
- *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+ sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
+ // Perform sum reduction
+ *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
}
+
+#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
\ No newline at end of file
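Taken together, the @note lines added above mean the rewritten float softmax kernels are configured entirely through build options rather than a runtime width argument plus padded accesses. A hypothetical set of defines for the serial max/shift/exp/sum kernel (example values only; the real ones are chosen on the host side, CLSoftmaxLayerKernel.cpp in this patch):

    // e.g. a row of 211 float elements, processed 16 at a time:
    //   -DDATA_TYPE=float -DMIN_VALUE=0 -DMINVAL=-FLT_MAX
    //   -DVECTOR_SIZE=16 -DLOG_VECTOR_SIZE=4 -DVECTOR_SIZE_LEFTOVER=3   // 211 % 16 = 3
    //   -DSRC_WIDTH=211 -DNON_MULTIPLE_OF_VECTOR_SIZE -DBETA=1.0f
    // Add -DLOG_SOFTMAX for log-softmax, and -DGRID_SIZE for the parallel variant.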
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 22b8df8f74..b7a6e00dfa 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -23,67 +23,107 @@
*/
#include "helpers_asymm.h"
-#define MAX_OP(x, y, type, size) max((x), (y))
-#define ADD_OP(x, y, type, size) ((x) + (y))
-#define SUB_OP(x, y, type, size) ((x) - (y))
+#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#if VECTOR_SIZE == 2
-__constant uint2 idx__ = (uint2)(0, 1);
-#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
-#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
-#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)
-
-#elif VECTOR_SIZE == 4
-__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
-#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
-#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
-#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)
-
-#elif VECTOR_SIZE == 8
-__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
-#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
-#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
-#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)
-
-#else /* VECTOR_SIZE DEFAULT */
-#define VECTOR_SIZE 16
-#define LOG_VECTOR_SIZE 4
-__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
-#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
-#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)
-
-#endif /* VECTOR_SIZE END */
-
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
-#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
-#if defined(DIFF_MIN)
-
-VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
+/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
+ * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm_quantized(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(sum),
+ TENSOR3D_DECLARATION(dst))
{
+ const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+ // Load max value of 1D logits vector (row)
+ int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
+
+    // It would be better to calculate this in the previous layer and pass it here as a parameter
+ uint sum_val_u = convert_uint(sum_val);
+ int headroom_plus_one = clz(sum_val_u);
+ int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
+ int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
+ VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
+ VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
+
+    // This was already calculated in the previous layer; it should be stored into a tmp output and reused
+ VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
+ VEC_INT data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
if(INPUT_BETA_MULTIPLIER > 1)
{
- return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
+ data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
}
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
- return data;
+
+ VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+ data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
+ data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
+#ifdef QASYMM8_SIGNED
+ data += (VEC_INT)(MIN_VALUE);
+#endif /* QASYMM8_SIGNED */
+ data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
+ VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
-int4 mult_by_quantized_multiplier_parallel(int4 data)
+#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
+
+VEC_INT mult_by_quantized_multiplier(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
if(INPUT_BETA_MULTIPLIER > 1)
{
- return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
+ return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
}
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
return data;
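mult_by_quantized_multiplier applies the quantized beta as a left shift followed by a Q0.31 fixed-point multiply, so ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE) approximates data * beta_real. A hedged host-side sketch of how such a (multiplier, shift) pair could be derived is shown below; the function name and the exact rounding used by the library's AsymmHelpers may differ:

    #include <cmath>
    #include <cstdint>

    // Split a real multiplier m > 1 into a Q0.31 mantissa and a left shift so that
    //   m ~= (quantized_multiplier / 2^31) * 2^left_shift   (illustrative only)
    void decompose_multiplier_gt_one(double m, int32_t &quantized_multiplier, int32_t &left_shift)
    {
        int exponent = 0;
        const double q  = std::frexp(m, &exponent); // m = q * 2^exponent, q in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::llround(q * static_cast<double>(1ll << 31)));
        if(q_fixed == (1ll << 31))                  // rounding pushed q up to 1.0
        {
            q_fixed /= 2;
            ++exponent;
        }
        quantized_multiplier = static_cast<int32_t>(q_fixed);
        left_shift           = exponent;
    }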
@@ -92,9 +132,15 @@ int4 mult_by_quantized_multiplier_parallel(int4 data)
/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
* then gets the exponent of each element as sums all elements across each row.
*
- * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
+ * @note In case the input is not a multiple of VECTOR_SIZE -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
+ * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -128,111 +174,89 @@ int4 mult_by_quantized_multiplier_parallel(int4 data)
* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
*/
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(maxo),
TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
// Calculate max of row
- const uint width4 = width >> LOG_VECTOR_SIZE;
- for(uint i = 0; i < width4; i++)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
- max_val_vec = MAX_OP(data, max_val_vec, DATA_TYPE, 16);
- }
-
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- // Handle non multiple of 16
VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
- VEC_UCHAR widx = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
- max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data, widx), DATA_TYPE, 16);
+ VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ {
+ VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+ max_val_vec = max(data, max_val_vec);
+ }
+
// Perform max reduction
-#if VECTOR_SIZE == 16
- max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
- max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
- max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
-
- // Store result
- *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+ DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
+ *((__global DATA_TYPE *)maxo.ptr) = max_local;
// Second part
// Load max value of 1D logits vector (row)
- int max_val = convert_int(*((__global DATA_TYPE *)offset(&maxo, 0, 0)));
+ int max_val = convert_int(max_local);
// Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
VEC_INT sum1D = 0;
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+ VEC_INT data_fp = CONVERT(data, VEC_INT);
+ VEC_INT data_diff = data_fp - max_val;
+ VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+ data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+ data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data_diff, 0, (__global int *)dst_addr);
+ data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+ sum1D += select(0, data_fp, widx);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
// Shift values, exp and sum
- for(uint i = 0; i < width4; i++)
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
{
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+ VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
VEC_INT data_fp = CONVERT(data, VEC_INT);
VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
- data_fp = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
- data_fp = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+ VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+ data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+ data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+ (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
}
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- // Handle non multiple of 16
- data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
- data_fp = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
- data_fp = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
- VEC_INT widx_ = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
- data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- sum1D = sum1D + select(0, data_fp, widx_);
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
// Perform sum reduction
-#if VECTOR_SIZE == 16
- sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
-#endif /* VECTOR SIZE 8 END */
-#if VECTOR_SIZE >= 4
- sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
-#endif /* VECTOR SIZE 4 END */
- sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
-
- // Calculate and store result
- *((__global int *)sum.ptr) = sum1D.s0;
+ *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
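In the serial kernel above, the NON_MULTIPLE_OF_VECTOR_SIZE block handles the first VECTOR_SIZE_LEFTOVER elements of the row with a masked full-width load, and the main loop then starts at i = VECTOR_SIZE_LEFTOVER and covers the rest in full vectors. A worked example with hypothetical numbers:

    // SRC_WIDTH = 153, VECTOR_SIZE = 16  =>  VECTOR_SIZE_LEFTOVER = 153 % 16 = 9
    // leftover block : loads elements [0, 16); widx keeps lanes 0..8 and masks out lanes 9..15
    // main loop      : i = 9, 25, 41, ..., 137 -> nine full 16-wide iterations covering elements [9, 153)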
/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
* then gets the exponent of each element as sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
+ * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
*
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -266,72 +290,59 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
* @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
* @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
*/
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(maxo),
TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ const uint lid = get_local_id(0);
+ const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
- const uint4 idx4 = (uint4)(0, 1, 2, 3);
- const uint lid = get_local_id(0);
-
// Define one temporary vector per work-item.
- __local int4 tmp_local[GRID_SIZE];
+ __local VEC_INT tmp_local[GRID_SIZE];
__local DATA_TYPE max_local;
- VEC_DATA_TYPE(DATA_TYPE, 4)
- vec_min_val = (VEC_DATA_TYPE(DATA_TYPE, 4))(MIN_VALUE);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- max_val_vec = vec_min_val;
+ VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
+ VEC_BASE max_val_vec = vec_min_val;
- // Number of elements per work-item.
- const uint row = width / GRID_SIZE;
// Number of iterations per work-item.
- const uint width_ = row >> 2;
+ const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
// Calculate max of row
uint i = 0;
- for(; i < width_; i++)
+ for(; i < width; ++i)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+ VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ max_val_vec = max(data_max, max_val_vec);
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
// How many work-items needed to complete the computation.
//TODO: Optimize this calculation (avoid %).
- int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
if(lid < boundary_workitems)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+ VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ max_val_vec = max(data_max, max_val_vec);
}
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ VEC_INT widx;
+ if(lid == 0)
{
        // Handle non multiple of vector size
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data_max = vload4(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- widx = CONVERT((((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width), VEC_DATA_TYPE(DATA_TYPE, 4));
- max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data_max, widx), DATA_TYPE, 4);
+ VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
}
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = convert_int4(max_val_vec);
+ tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -339,7 +350,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 128)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -347,7 +358,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 64)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -355,7 +366,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 32)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -363,7 +374,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 16)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -371,7 +382,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 8)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -379,7 +390,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 4)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -387,72 +398,64 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 2)
{
- tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
+ tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
- max_val_vec = MAX_OP(CONVERT((tmp_local[lid + 1]), VEC_DATA_TYPE(DATA_TYPE, 4)), CONVERT((tmp_local[lid]), VEC_DATA_TYPE(DATA_TYPE, 4)), DATA_TYPE, 4);
- max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
- max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
- max_local = max_val_vec.s0;
+ max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
+ max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Second section */
// Set sum vector
- int4 sum1D = 0;
- int max_val = convert_int(max_local);
+ VEC_INT sum1D = 0;
+ int max_val = convert_int(max_local);
// Shift values, exp and sum
- for(i = 0; i < width_; i++)
+ for(i = 0; i < width; ++i)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- int4 data_fp = convert_int4(data);
- int4 data_diff = data_fp - max_val;
- int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
- vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
- sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
+ VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ VEC_INT data_fp = CONVERT(data, VEC_INT);
+ VEC_INT data_diff = data_fp - max_val;
+ VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+ data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+ data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+ VSTORE(VECTOR_SIZE)
+ (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
+ sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
//TODO: Optimize the calculation (avoid %).
- boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
if(lid < boundary_workitems)
{
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- int4 data_fp = convert_int4(data);
- int4 data_diff = data_fp - max_val;
- int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
- vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
- sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
+ VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ VEC_INT data_fp = CONVERT(data, VEC_INT);
+ VEC_INT data_diff = data_fp - max_val;
+ VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+ data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+ data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+ VSTORE(VECTOR_SIZE)
+ (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
+ sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
}
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ if(lid == 0)
{
        // Handle non multiple of vector size: lid == 0 processes the leftover elements at the start of the row
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
- int4 data_fp = convert_int4(data);
- int4 data_diff = data_fp - max_val;
- int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
- int4 widx = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
- vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
- data_fp = select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+ VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ VEC_INT data_fp = CONVERT(data, VEC_INT);
+ VEC_INT data_diff = data_fp - max_val;
+ VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+ data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+ data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
+ data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
data_fp = select(0, data_fp, widx);
sum1D = sum1D + data_fp;
}
@@ -466,7 +469,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 128)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -474,7 +477,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 64)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -482,7 +485,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 32)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -490,7 +493,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 16)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -498,7 +501,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 8)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -506,7 +509,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 4)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -514,88 +517,16 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
{
if(lid < 2)
{
- tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
+ tmp_local[lid] += tmp_local[lid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
- sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
- // Perform max reduction
- sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
- sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
- *((__global int *)sum.ptr) = sum1D.s0;
+ sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
+ // Perform sum reduction
+ *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
}
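The GRID_SIZE-guarded blocks above form a power-of-two tree reduction in local memory (the host only defines GRID_SIZE when it is a power of two; see the is_grid_size_pow2 check in CLSoftmaxLayerKernel.cpp further down). Written as a loop, the same reduction would look roughly like the sketch below (illustrative only, not the kernel's code):

    for(uint s = GRID_SIZE / 2; s > 0; s >>= 1)
    {
        if(lid < s)
        {
            tmp_local[lid] += tmp_local[lid + s]; // pairwise partial sums
        }
        barrier(CLK_LOCAL_MEM_FENCE);             // every work-item reaches the barrier
    }
    if(lid == 0)
    {
        *((__global int *)sum.ptr) = SUM_REDUCE(tmp_local[0], VECTOR_SIZE); // final lane reduction
    }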
-
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
- *
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load max value of 1D logits vector (row)
- int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
-
- // It will be better to calculate this in prev layer and pass here as parameter
- uint sum_val_u = convert_uint(sum_val);
- int headroom_plus_one = clz(sum_val_u);
- int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
- int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
- int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
- int16 shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
-
- // It was already calculated in prev layer, should be stored into tmp output and reused
- int16 data_diff = vload16(0, (__global int *)offset(&src, 0, 0));
- int16 data_diff_mult = data_diff;
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
- if(INPUT_BETA_MULTIPLIER > 1)
- {
- data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
- }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
-
- int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
- data = ASYMM_MULT(shifted_scale, data, 16);
- data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
-#ifdef QASYMM8_SIGNED
- data = ADD_OP(data, (int16)(MIN_VALUE), int, 16);
-#endif /* QASYMM8_SIGNED */
- data = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN));
- vstore16(CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-}
-
-#endif /* defined(DIFF_MIN) */
+#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
+#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */
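Putting the fixed-point steps of softmax_layer_norm_quantized together, each output approximates exp(beta * scale * (x - max)) divided by the row sum of exponentials, mapped onto the fixed 1/256 output scale (with an extra MIN_VALUE offset in the QASYMM8_SIGNED case). A floating-point reference sketch for the unsigned case, under those assumptions and not the kernel's arithmetic:

    #include <cmath>
    #include <cstdint>

    // diff    : quantized logit minus the row maximum (<= 0)
    // sum_exp : sum of exp(beta * scale * diff_j) over the row
    // beta    : softmax beta; scale: input quantization scale
    uint8_t softmax_norm_quantized_ref(int32_t diff, double sum_exp, double beta, double scale)
    {
        const double p = std::exp(beta * scale * diff) / sum_exp; // probability in [0, 1]
        double out     = std::round(p * 256.0);                   // output scale is 1/256, offset 0
        if(out > 255.0)
        {
            out = 255.0;                                           // saturate like CONVERT_SAT
        }
        return static_cast<uint8_t>(out);
    }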
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index 6e969bd111..5e5b737785 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -23,48 +23,48 @@
*/
#include "helpers.h"
-#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s0) && (z_cond))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s1) && (z_cond))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s2) && (z_cond))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s3) && (z_cond))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##1).s0) && (z_cond))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##1).s1) && (z_cond))); \
+#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \
+ ({ \
+ basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
+ basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
+ basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
+ basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
+ basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s0) && (z_cond))); \
+ basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s1) && (z_cond))); \
})
-#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s0))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s1))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s2))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s3))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##1).s0))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##1).s1))); \
+#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \
+ ({ \
+ basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
+ basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
+ basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
+ basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
+ basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s0))); \
+ basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s1))); \
})
-#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s0) && (z_cond))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s1) && (z_cond))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s2) && (z_cond))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s3) && (z_cond))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s4) && (z_cond))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s5) && (z_cond))); \
- basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s6) && (z_cond))); \
- basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype, 1))(((y_cond##0).s7) && (z_cond))); \
+#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \
+ ({ \
+ basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \
+ basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \
+ basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \
+ basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \
+ basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s4) && (z_cond))); \
+ basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s5) && (z_cond))); \
+ basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s6) && (z_cond))); \
+ basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s7) && (z_cond))); \
})
-#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \
- ({ \
- basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s0))); \
- basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s1))); \
- basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s2))); \
- basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s3))); \
- basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s4))); \
- basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s5))); \
- basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s6))); \
- basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype, 1))((y_cond) && ((z_cond##0).s7))); \
+#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \
+ ({ \
+ basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \
+ basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \
+ basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \
+ basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \
+ basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s4))); \
+ basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s5))); \
+ basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s6))); \
+ basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s7))); \
})
#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
@@ -1000,7 +1000,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
const int b = get_global_id(2) / NUM_TILES_Y;
#else // defined(NUM_TILES_Y)
// Index height
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif // defined(NUM_TILES_Y)
#if defined(NUM_TILES_Y)
@@ -1064,12 +1064,12 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d0, y_cond, z_cond0.s0);
#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
- DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
- DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
- DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
- DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
- DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
+ DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
+ DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
+ DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
+ DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
+ DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
+ DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(DATA_TYPE, d0, y_cond0.s0, z_cond);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
@@ -1135,7 +1135,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
#if defined(NUM_TILES_Y)
__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
#else // defined(NUM_TILES_Y)
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
#endif // defined(NUM_TILES_Y)
uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
@@ -1354,14 +1354,14 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
const int z = get_global_id(2) % NUM_TILES_Y;
const int b = get_global_id(2) / NUM_TILES_Y;
#else // defined(NUM_TILES_Y)
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif // defined(NUM_TILES_Y)
// Compute input address
#if defined(NUM_TILES_Y)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
#else // defined(NUM_TILES_Y)
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
#endif // defined(NUM_TILES_Y)
// Origin coordinates for the width (y) and height (z) in the input tensor
diff --git a/src/core/CL/cl_kernels/yolo_layer.cl b/src/core/CL/cl_kernels/yolo_layer.cl
index fe7b5cbb55..9601dddf67 100644
--- a/src/core/CL/cl_kernels/yolo_layer.cl
+++ b/src/core/CL/cl_kernels/yolo_layer.cl
@@ -25,7 +25,7 @@
#include "activation_float_helpers.h"
-#define SELECT_TYPE SELECT_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#if VEC_SIZE != 1
#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index d9f498c522..526d9e187d 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -23,26 +23,14 @@
*/
#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options.
@@ -153,59 +141,6 @@ Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *su
return Status{};
}
-
-// Window validation
-
-std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum, input->clone()->set_tensor_shape(max->tensor_shape()));
- auto_init_if_empty(*output, *input->clone());
-
- CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo parallel_reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(input->dimension(0));
- unsigned int vector_size = std::get<1>(parallel_reduction_info);
- const unsigned int num_elems_x = ceil_to_multiple(input->tensor_shape().x(), vector_size);
- Window win = calculate_max_window(*input, Steps(num_elems_x));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_x);
- AccessWindowHorizontal max_access(max, 0, 1);
- AccessWindowHorizontal output_access(output, 0, num_elems_x);
- AccessWindowHorizontal sum_access(sum, 0, 1);
-
- bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
- output_access.set_valid_region(win, input->valid_region());
- sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *input, ITensorInfo *output, ITensorInfo *sum, const SoftmaxKernelInfo &info)
-{
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output,
- input->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic sum_access(sum, 0, 0, 1, sum->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
-
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
} // namespace
/**< Grid size (obtained through auto-tuning) */
@@ -229,6 +164,8 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
+ auto padding_info = get_padding_info({ input, max, output, sum });
+
// Output auto initialization if not yet initialized
auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape()));
auto_init_if_empty(*output->info(), *input->info()->clone());
@@ -248,30 +185,31 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
+ ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
+ const unsigned int vector_size = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size);
+
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size));
+ build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size));
+ build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
+ build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
- build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
+ build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
- build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
cl::NDRange lws_hint(cl::NullRange);
- std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
- std::string("softmax_layer_max_shift_exp_sum_serial");
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
- unsigned int vector_size = std::get<1>(parallel_reduction_info);
-
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
- build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
+ std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "");
// Configure parallel kernel if needed
if(std::get<0>(parallel_reduction_info))
{
- kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel");
+ kernel_name += "parallel";
bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
@@ -282,25 +220,24 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
// A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
lws_hint = cl::NDRange(_grid_size);
}
+ else
+ {
+ kernel_name += "serial";
+ }
// Create kernel.
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- // Set static arguments. Both the kernels use the same arguments
- unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, reduction_dim_size);
-
// Configure window
- auto win_config = validate_and_configure_window_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second, lws_hint);
+ Window win = calculate_max_window(*(input->info()), Steps(reduction_dim_size));
+ ICLKernel::configure_internal(win, lws_hint);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMaxShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
-
return Status{};
}
@@ -323,9 +260,8 @@ void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue
ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
if(std::get<0>(parallel_reduction_info))
{
- // To launch grid_size parallel workitems, steps.x should be modified as follows.
- const unsigned int step = std::get<1>(parallel_reduction_info);
- window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step));
+ // Launch grid_size parallel work items
+ window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
}
// Get slices
@@ -357,6 +293,8 @@ void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, co
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+ auto padding_info = get_padding_info({ input, output, sum });
+
// Note: output should always have a scale of 1/256 and offset 0
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
const DataType output_data_type = info.input_data_type;
@@ -374,32 +312,35 @@ void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, co
_sum = sum;
_output = output;
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
+ const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
+ const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
+ const unsigned int vector_size = adjust_vec_size(16, input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type));
build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+ build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED");
build_opts.add_options_if(is_quantized_asymmetric,
prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
// Create kernel
- std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
+ std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : "");
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure window
- auto win_config = validate_and_configure_window_1DNorm(input->info(), output->info(), sum->info(), info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ auto win = calculate_max_window(*(input->info()), Steps(vector_size));
+ ICLKernel::configure_internal(win);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DNorm(input->clone().get(), output->clone().get(), sum->clone().get(), info).first);
return Status{};
}
@@ -426,3 +367,4 @@ void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
}
while(window_collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
\ No newline at end of file
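
Note on the VECTOR_SIZE / VECTOR_SIZE_LEFTOVER pattern used in the hunks above: the host picks a vector width no larger than the reduction dimension, passes the remainder as VECTOR_SIZE_LEFTOVER, and defines NON_MULTIPLE_OF_VECTOR_SIZE whenever that remainder is non-zero, so the kernel can process a partial vector rather than relying on padded tensor storage (hence the has_padding_changed asserts). The sketch below illustrates that host-side derivation only; pick_vec_size() is an assumed stand-in for illustration and is not the library's adjust_vec_size(), and the 27-element axis merely mirrors the 27U-wide shapes in the tests that follow.

#include <algorithm>

// Assumed stand-in for illustration: clamps the preferred vector width to the
// size of the dimension being reduced. Not the library's adjust_vec_size().
static unsigned int pick_vec_size(unsigned int preferred, unsigned int dim_size)
{
    return std::min(preferred, dim_size);
}

int main()
{
    const unsigned int reduction_dim_size = 27;                                    // e.g. a 27-wide softmax axis
    const unsigned int vec_size           = pick_vec_size(16, reduction_dim_size); // -> -DVECTOR_SIZE=16
    const unsigned int leftover           = reduction_dim_size % vec_size;         // -> -DVECTOR_SIZE_LEFTOVER=11
    // leftover != 0 would also select -DNON_MULTIPLE_OF_VECTOR_SIZE, the path that
    // handles the trailing partial vector without any extra padding on the tensor.
    return static_cast<int>(leftover);
}

With these values the work window is stepped by the full reduction dimension (serial/parallel sum kernels) or by vec_size (norm kernel), which is why the configure functions above no longer need the old validate_and_configure_window helpers.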
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index fe31b00e00..396e274e0b 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -69,8 +69,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Window shrink
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),// Invalid input dimensionality
TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
QuantizationInfo(1.f/256, 12)),
@@ -85,8 +83,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
QuantizationInfo(1.f/256, 0)),
@@ -105,22 +101,18 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
2.0,
1.0,
2.0,
- 1.0,
- 2.0,
})),
framework::dataset::make("axis", {
0,
0,
0,
- 0,
- 0,
1,
0,
-1,
2,
-3,
})),
- framework::dataset::make("Expected", { false, false, false, false, false, true, true, true, false, false })),
+ framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })),
input_info, output_info, beta, axis, expected)
{
ARM_COMPUTE_EXPECT(bool(CLSoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);