aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL
diff options
context:
space:
mode:
authorVidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>2018-07-04 09:34:00 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:10 +0000
commit7485d5a62685cb745ab50e970adb722cb71557ac (patch)
treeba01b99ca466c93edc9a3f8c1e34394ff84be060 /src/core/CL
parent014333d73883c3872e458cedda5ccef586a7ccd4 (diff)
downloadComputeLibrary-7485d5a62685cb745ab50e970adb722cb71557ac.tar.gz
COMPMID-970 : Remove QS8 / QS16 support
Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/CL')
-rw-r--r--src/core/CL/CLHelpers.cpp10
-rw-r--r--src/core/CL/CLKernelLibrary.cpp10
-rw-r--r--src/core/CL/cl_kernels/activation_layer.cl22
-rw-r--r--src/core/CL/cl_kernels/arithmetic_op.cl10
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl17
-rw-r--r--src/core/CL/cl_kernels/channel_shuffle.cl2
-rw-r--r--src/core/CL/cl_kernels/col2im.cl42
-rw-r--r--src/core/CL/cl_kernels/concatenate.cl4
-rw-r--r--src/core/CL/cl_kernels/convert_fc_weights.cl2
-rw-r--r--src/core/CL/cl_kernels/convolution_layer.cl4
-rw-r--r--src/core/CL/cl_kernels/depth_convert.cl37
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution.cl4
-rw-r--r--src/core/CL/cl_kernels/dequantization_layer.cl4
-rw-r--r--src/core/CL/cl_kernels/direct_convolution1x1.cl14
-rw-r--r--src/core/CL/cl_kernels/direct_convolution3x3.cl17
-rw-r--r--src/core/CL/cl_kernels/fill_border.cl4
-rw-r--r--src/core/CL/cl_kernels/fixed_point.h518
-rw-r--r--src/core/CL/cl_kernels/gemm.cl705
-rw-r--r--src/core/CL/cl_kernels/im2col.cl36
-rw-r--r--src/core/CL/cl_kernels/l2_normalize.cl6
-rw-r--r--src/core/CL/cl_kernels/normalization_layer.cl26
-rw-r--r--src/core/CL/cl_kernels/permute.cl6
-rw-r--r--src/core/CL/cl_kernels/pixelwise_mul_int.cl21
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl48
-rw-r--r--src/core/CL/cl_kernels/reshape_layer.cl4
-rw-r--r--src/core/CL/cl_kernels/softmax_layer.cl28
-rw-r--r--src/core/CL/cl_kernels/softmax_layer_quantized.cl4
-rw-r--r--src/core/CL/kernels/CLActivationLayerKernel.cpp21
-rw-r--r--src/core/CL/kernels/CLArithmeticAdditionKernel.cpp14
-rw-r--r--src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp14
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp4
-rw-r--r--src/core/CL/kernels/CLCol2ImKernel.cpp6
-rw-r--r--src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLDepthConvertLayerKernel.cpp18
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp1
-rw-r--r--src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp1
-rw-r--r--src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp1
-rw-r--r--src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLDequantizationLayerKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp13
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLFloorKernel.cpp4
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp5
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp17
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp16
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp1
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLIm2ColKernel.cpp6
-rw-r--r--src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp3
-rw-r--r--src/core/CL/kernels/CLMinMaxLayerKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.cpp19
-rw-r--r--src/core/CL/kernels/CLPermuteKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp31
-rw-r--r--src/core/CL/kernels/CLPoolingLayerKernel.cpp9
-rw-r--r--src/core/CL/kernels/CLQuantizationLayerKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLROIPoolingLayerKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.cpp7
-rw-r--r--src/core/CL/kernels/CLReshapeLayerKernel.cpp7
-rw-r--r--src/core/CL/kernels/CLSoftmaxLayerKernel.cpp15
-rw-r--r--src/core/CL/kernels/CLTransposeKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLWeightsReshapeKernel.cpp5
-rw-r--r--src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp3
66 files changed, 128 insertions, 1773 deletions
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index df06aff647..07f8bd7bcd 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -38,8 +38,6 @@ std::string get_cl_type_from_data_type(const DataType &dt)
{
case DataType::U8:
return "uchar";
- case DataType::QS8:
- return "qs8";
case DataType::S8:
return "char";
case DataType::QASYMM8:
@@ -48,8 +46,6 @@ std::string get_cl_type_from_data_type(const DataType &dt)
return "ushort";
case DataType::S16:
return "short";
- case DataType::QS16:
- return "qs16";
case DataType::U32:
return "uint";
case DataType::S32:
@@ -75,13 +71,11 @@ std::string get_data_size_from_data_type(const DataType &dt)
switch(dt)
{
case DataType::U8:
- case DataType::QS8:
case DataType::S8:
case DataType::QASYMM8:
return "8";
case DataType::U16:
case DataType::S16:
- case DataType::QS16:
case DataType::F16:
return "16";
case DataType::U32:
@@ -101,10 +95,6 @@ std::string get_underlying_cl_type_from_data_type(const DataType &dt)
{
switch(dt)
{
- case DataType::QS8:
- return "char";
- case DataType::QS16:
- return "short";
case DataType::QS32:
return "int";
default:
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index db4b344935..42cf21350d 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -231,22 +231,16 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemm_interleave4x4", "gemm.cl" },
{ "gemm_ma_f16", "gemm.cl" },
{ "gemm_ma_f32", "gemm.cl" },
- { "gemm_ma_qs8", "gemm.cl" },
- { "gemm_ma_qs16", "gemm.cl" },
{ "gemm_mv", "gemv.cl" },
{ "gemm_mv_quantized", "gemv.cl" },
{ "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
- { "gemm_mm_interleaved_transposed_qs8", "gemm.cl" },
- { "gemm_mm_interleaved_transposed_qs16", "gemm.cl" },
{ "gemm_mm_floating_point", "gemm.cl" },
{ "gemm_mm_floating_point_f16_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
- { "gemm_mm_qs8", "gemm.cl" },
- { "gemm_mm_qs16", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
{ "gemm_transpose1xW", "gemm.cl" },
{ "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
@@ -557,10 +551,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
#include "./cl_kernels/fill_border.clembed"
},
{
- "fixed_point.h",
-#include "./cl_kernels/fixed_point.hembed"
- },
- {
"floor.cl",
#include "./cl_kernels/floor.clembed"
},
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index a8ea7387d6..373406a6da 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -25,23 +25,6 @@
#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-
-#define CONST_ONE (1 << FIXED_POINT_POSITION)
-#define ABS_OP(a) ABS_SAT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE)
-#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
-#define SUB_OP(a, b) SUB_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
-#define MUL_OP(a, b) MUL_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define MLA_OP(a, b, c) MLA_SAT_OP_EXPAND((a), (b), (c), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define DIV_OP(a, b) DIV_SAT_OP_VEC_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define EXP_OP(a) EXP_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define LOG_OP(a) LOG_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define SQRT_OP(a) DIV_OP(CONST_ONE, INVSQRT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION))
-#define TANH_OP(a) TANH_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-
-#else /* FIXED_POINT_POSITION */
-
#define CONST_ONE 1.f
#define ABS_OP(a) fabs((a))
#define ADD_OP(a, b) ((a) + (b))
@@ -54,8 +37,6 @@
#define SQRT_OP(a) sqrt((a))
#define TANH_OP(a) tanh((a))
-#endif /* FIXED_POINT_POSITION */
-
// Logistic Activation
inline TYPE logistic_op(TYPE x)
{
@@ -125,9 +106,8 @@ inline TYPE linear_op(TYPE x)
* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
* @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- * @note In case of fixed point calculations the fixed point position is passed using -DFIXED_POINT_POSITION=position. e.g. -DFIXED_POINT_POSITION=3.
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 8bd28230b7..9efb71b199 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -23,10 +23,6 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif /* FIXED_POINT_POSITION */
-
#ifdef SATURATE
#define ADD(x, y) add_sat((x), (y))
#define SUB(x, y) sub_sat((x), (y))
@@ -43,7 +39,7 @@
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/QS8/QS16/S16/F16/F32
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -51,7 +47,7 @@
* @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
* @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -59,7 +55,7 @@
* @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
* @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 9c980da62a..5352af3c5a 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -25,25 +25,12 @@
#if defined(VEC_SIZE) && defined(DATA_TYPE)
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-
-#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
-#define SUB_OP(a, b) SUB_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
-#define MUL_OP(a, b) MUL_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define INVSQRT_OP(a) INVSQRT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define SQCVT_SAT(a) SQCVT_SAT_OP_EXPAND((a), DATA_TYPE, FIXED_POINT_POSITION)
-
-#else /* FIXED_POINT_POSITION */
-
#define ADD_OP(a, b) ((a) + (b))
#define SUB_OP(a, b) ((a) - (b))
#define MUL_OP(a, b) ((a) * (b))
#define INVSQRT_OP(a) rsqrt((a))
#define SQCVT_SAT(a) (a)
-#endif /* FIXED_POINT_POSITION */
-
#if defined(FUSED_ACTIVATION)
#include "activation_layer.cl"
#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
@@ -53,7 +40,7 @@
/** Apply batch normalization.
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
@@ -163,7 +150,7 @@ __kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
/** Apply batch normalization on tensors with NHWC format.
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 26cee9ccdd..23962e1c2e 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -38,7 +38,7 @@
* @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
* K is equal to num_channels / num_groups.
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
index 6e491f33cf..98bf8d1ed4 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/col2im.cl
@@ -23,12 +23,7 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif // FIXED_POINT_POSITION
-
#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
-#if !defined(FIXED_POINT_POSITION)
#if ELEMENT_SIZE == 1
#define COND_DATA_TYPE char
@@ -100,41 +95,4 @@ __kernel void col2im(
*((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6;
*((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7;
}
-#else // !defined(FIXED_POINT_POSITION)
-/** This kernel performs a reshaping of the output of the convolution layer.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=qs8
- * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=320
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void col2im(
- IMAGE_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint dst_stride_w)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
-
- // Compute output offset
- int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
-
- // Store value
- *((__global DATA_TYPE *)(dst.ptr + idx)) = *((__global DATA_TYPE *)(src.ptr));
-}
-#endif // !defined(FIXED_POINT_POSITION)
#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index f97ae13a9a..6ec8383c52 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -25,7 +25,7 @@
/** This kernel concatenates the input tensor into the output tensor along the first dimension
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QASYMM8, QS16, F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8, F16, F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -60,7 +60,7 @@ __kernel void concatenate_width(
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QS16, F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl
index 3c3e8b0dc4..5aadfb36f9 100644
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ b/src/core/CL/cl_kernels/convert_fc_weights.cl
@@ -32,7 +32,7 @@
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QS8, QASYMM8, U16, S16, QS16, U32, S32, QS32, F16, F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8, S8, QASYMM8, U16, S16, U32, S32, QS32, F16, F32
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 6a70b009c8..2b83e5adf1 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -23,10 +23,6 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif // FIXED_POINT_POSITION
-
#if defined(DATA_TYPE)
/** This kernel reshapes the tensor's low three dimensions to single column
*
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
index a9b7284c83..01491ec1b7 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,23 +23,6 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-
-#include "fixed_point.h"
-
-#ifdef SATURATE
-#define CONVERT_DOWN(x, in_type, out_type, fixed_point_position) CONVERT_DOWN1_SAT(x, in_type, out_type, fixed_point_position)
-#define CONVERT_DOWN1_SAT(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type##_sat(x, fixed_point_position)
-#else /* SATURATE */
-#define CONVERT_DOWN(x, in_type, out_type, fixed_point_position) CONVERT_DOWN1(x, in_type, out_type, fixed_point_position)
-#define CONVERT_DOWN1(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type(x, fixed_point_position)
-#endif /* SATURATE */
-
-#define CONVERT_UP(x, in_type, out_type, fixed_point_position) CONVERT_UP1(x, in_type, out_type, fixed_point_position)
-#define CONVERT_UP1(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type(x, fixed_point_position)
-
-#else /* FIXED_POINT_POSITION */
-
#ifdef SATURATE
#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
#else /* SATURATE */
@@ -48,22 +31,18 @@
#define CONVERT_UP(x, type) CONVERT(x, type)
-#endif /* FIXED_POINT_POSITION */
-
/** This function performs a down-scaling depth conversion.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
*
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
- *
* @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F16, F32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: QS8, U8, QS16, U16, S16, U32, S32
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32, S32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -84,11 +63,7 @@ __kernel void convert_depth_down(
VEC_DATA_TYPE(DATA_TYPE_IN, 16)
in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
-#if defined(FIXED_POINT_POSITION)
- vstore16(CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_IN, 16), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), FIXED_POINT_POSITION), 0, (__global DATA_TYPE_OUT *)out.ptr);
-#else /* FIXED_POINT_POSITION */
vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-#endif /* FIXED_POINT_POSITION */
}
/** This function performs a up-scaling depth conversion.
@@ -96,9 +71,7 @@ __kernel void convert_depth_down(
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
*
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, QS8, U16, S16, QS16, U32 or S32
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
@@ -125,9 +98,5 @@ __kernel void convert_depth_up(
VEC_DATA_TYPE(DATA_TYPE_IN, 16)
in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
-#if defined(FIXED_POINT_POSITION)
- vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_IN, 16), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), FIXED_POINT_POSITION), 0, (__global DATA_TYPE_OUT *)out.ptr);
-#else /* FIXED_POINT_POSITION */
vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
-#endif /* FIXED_POINT_POSITION */
}
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index f3aa0d6dd8..9a8b57e4c4 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -527,7 +527,7 @@ __kernel void depthwise_weights_reshape(
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -587,7 +587,7 @@ __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(d
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
index 21e9c873ac..4908bb0b31 100644
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
/** This performs the dequantization of 8-bit unsigned integers to floating point.
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
index 817c261ba2..7a308c99e2 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,24 +23,12 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-
-#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
-#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
-
-// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
-MULQ_SAT_IMPL(qs32x8, qs32x8)
-
-#else /* FIXED_POINT_POSITION */
#undef CONVERT_SAT
#define ADD_OP(a, b) ((a) + (b))
#define MUL_OP(a, b) ((a) * (b))
#define CONVERT_SAT(a, b) ((a))
-#endif /* FIXED_POINT_POSITION */
-
#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
#if STRIDE_X == 3
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
index a7abc9ff1d..824306f2ba 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,25 +23,12 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-
-#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
-#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
-
-// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
-MULQ_SAT_IMPL(qs32x8, qs32x8)
-
-#else /* FIXED_POINT_POSITION */
-
#undef CONVERT_SAT
#define ADD_OP(a, b) ((a) + (b))
#define MUL_OP(a, b) ((a) * (b))
#define CONVERT_SAT(a, b) ((a))
-#endif /* FIXED_POINT_POSITION */
-
#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
#if STRIDE_X == 1
@@ -86,7 +73,7 @@ MULQ_SAT_IMPL(qs32x8, qs32x8)
* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
* @note If biases are used then -DHAS_BIAS has to be passed at compile time
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index 33a9495d66..9d6a2b8b5a 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -23,10 +23,6 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif /* FIXED_POINT_POSITION */
-
/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel.
*
* @attention The DATA_TYPE needs to be passed at the compile time.
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
deleted file mode 100644
index 46fa645c2b..0000000000
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ /dev/null
@@ -1,518 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_FIXED_POINT_H
-#define ARM_COMPUTE_FIXED_POINT_H
-
-#define TYPE_ALIAS(type, alias) \
- typedef type alias; \
- typedef type alias##x##1; \
- typedef type##2 alias##x##2; \
- typedef type##3 alias##x##3; \
- typedef type##4 alias##x##4; \
- typedef type##8 alias##x##8; \
- typedef type##16 alias##x##16;
-
-TYPE_ALIAS(char, qs8)
-TYPE_ALIAS(short, qs16)
-TYPE_ALIAS(int, qs32)
-
-#define qs8_MIN ((char)CHAR_MIN)
-#define qs8_MAX ((char)CHAR_MAX)
-#define qs16_MIN ((short)SHRT_MIN)
-#define qs16_MAX ((short)SHRT_MAX)
-#define qs32_MIN ((int)INT_MIN)
-#define qs32_MAX ((int)INT_MAX)
-
-#define qu8_MIN ((uchar)0)
-#define qu8_MAX ((uchar)UCHAR_MAX)
-#define qu16_MIN ((ushort)0)
-#define qu16_MAX ((ushort)USHRT_MAX)
-#define qu32_MIN ((uint)0)
-#define qu32_MAX ((uint)UINT_MAX)
-
-#define qs8_TYPE char
-#define qs8x1_TYPE char
-#define qs8x2_TYPE char2
-#define qs8x3_TYPE char3
-#define qs8x4_TYPE char4
-#define qs8x8_TYPE char8
-#define qs8x16_TYPE char16
-
-#define qs16_TYPE short
-#define qs16x1_TYPE short
-#define qs16x2_TYPE short2
-#define qs16x3_TYPE short3
-#define qs16x4_TYPE short4
-#define qs16x8_TYPE short8
-#define qs16x16_TYPE short16
-
-#define qs32_TYPE int
-#define qs32x1_TYPE int
-#define qs32x2_TYPE int2
-#define qs32x3_TYPE int3
-#define qs32x4_TYPE int4
-#define qs32x8_TYPE int8
-#define qs32x16_TYPE int16
-
-/* All internal constants are represented in the maximum supported fixed point format (QS16),
- * thus we define an additional shift parameter required to convert the constant
- * from the maximum supported format to the require one.
- */
-#define qs8_SHIFT 8
-#define qs16_SHIFT 0
-
-#undef VEC_DATA_TYPE_STR
-#undef VEC_DATA_TYPE
-#undef CONVERT_STR
-#undef CONVERT
-#undef CONVERT_SAT_STR
-#undef CONVERT_SAT
-
-#define VEC_DATA_TYPE_STR(type, size) type##x##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
-#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
-#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
-#define CONVERT(x, type) CONVERT_STR(x, type)
-
-#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
-#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
-#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
-
-/** Computes saturating absolute value of fixed point vector.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point absolute value.
- */
-#define ABSQ_SAT_IMPL(type) \
- inline type abs_##type##_sat(type VopA) \
- { \
- return CONVERT_SAT(abs(VopA), type); \
- }
-
-ABSQ_SAT_IMPL(qs8x16)
-ABSQ_SAT_IMPL(qs16x8)
-
-#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
-#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
-
-/** Computes max of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point maximum.
- */
-#define MAXQ_IMPL(type) \
- inline type max_##type(type VopA, type VopB) \
- { \
- return max(VopA, VopB); \
- }
-
-MAXQ_IMPL(qs8x1)
-MAXQ_IMPL(qs8x2)
-MAXQ_IMPL(qs8x4)
-MAXQ_IMPL(qs8x8)
-MAXQ_IMPL(qs8x16)
-MAXQ_IMPL(qs16x1)
-MAXQ_IMPL(qs16x2)
-MAXQ_IMPL(qs16x4)
-MAXQ_IMPL(qs16x8)
-MAXQ_IMPL(qs16x16)
-
-#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
-#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated addition of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point addition. The result is saturated in case of overflow
- */
-#define ADDQ_SAT_IMPL(type) \
- inline type add_sat_##type(type VopA, type VopB) \
- { \
- return add_sat(VopA, VopB); \
- }
-
-ADDQ_SAT_IMPL(qs8x1)
-ADDQ_SAT_IMPL(qs8x2)
-ADDQ_SAT_IMPL(qs8x4)
-ADDQ_SAT_IMPL(qs8x8)
-ADDQ_SAT_IMPL(qs8x16)
-ADDQ_SAT_IMPL(qs16x1)
-ADDQ_SAT_IMPL(qs16x2)
-ADDQ_SAT_IMPL(qs16x4)
-ADDQ_SAT_IMPL(qs16x8)
-ADDQ_SAT_IMPL(qs16x16)
-ADDQ_SAT_IMPL(qs32x1)
-ADDQ_SAT_IMPL(qs32x2)
-ADDQ_SAT_IMPL(qs32x4)
-ADDQ_SAT_IMPL(qs32x8)
-ADDQ_SAT_IMPL(qs32x16)
-
-#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
-#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/** Computes saturated subtraction of fixed point types.
- *
- * @param[in] type the actual data type.
- *
- * @return The result of the fixed point subtraction. The result is saturated in case of overflow
- */
-#define SUBQ_SAT_IMPL(type) \
- inline type sub_sat_##type(type VopA, type VopB) \
- { \
- return sub_sat(VopA, VopB); \
- }
-
-SUBQ_SAT_IMPL(qs8x1)
-SUBQ_SAT_IMPL(qs8x2)
-SUBQ_SAT_IMPL(qs8x4)
-SUBQ_SAT_IMPL(qs8x8)
-SUBQ_SAT_IMPL(qs8x16)
-SUBQ_SAT_IMPL(qs16x1)
-SUBQ_SAT_IMPL(qs16x2)
-SUBQ_SAT_IMPL(qs16x4)
-SUBQ_SAT_IMPL(qs16x8)
-SUBQ_SAT_IMPL(qs16x16)
-
-#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
-#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
-
-/* Multiply of two fixed point numbers
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication.
- */
-#define MULQ_IMPL(type, itype) \
- inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype round_val = (itype)(1 << (fixed_point_position - 1)); \
- itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
- return CONVERT((res >> (itype)fixed_point_position), type); \
- }
-
-MULQ_IMPL(qs8x8, qs16x8)
-MULQ_IMPL(qs16x8, qs32x8)
-MULQ_IMPL(qs8x16, qs16x16)
-MULQ_IMPL(qs16x16, qs32x16)
-
-#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
-#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
-
-/* Saturate multiply of two fixed point numbers
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiplication. The result is saturated in case of overflow
- */
-#define MULQ_SAT_IMPL(type, itype) \
- inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype round_val = (itype)(1 << (fixed_point_position - 1)); \
- itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
- return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
- }
-
-MULQ_SAT_IMPL(qs8x1, qs16x1)
-MULQ_SAT_IMPL(qs8x2, qs16x2)
-MULQ_SAT_IMPL(qs8x3, qs16x3)
-MULQ_SAT_IMPL(qs8x4, qs16x4)
-MULQ_SAT_IMPL(qs8x8, qs16x8)
-MULQ_SAT_IMPL(qs8x16, qs16x16)
-MULQ_SAT_IMPL(qs16x1, qs32x1)
-MULQ_SAT_IMPL(qs16x2, qs32x2)
-MULQ_SAT_IMPL(qs16x3, qs32x3)
-MULQ_SAT_IMPL(qs16x4, qs32x4)
-MULQ_SAT_IMPL(qs16x8, qs32x8)
-MULQ_SAT_IMPL(qs16x16, qs32x16)
-
-#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position))
-#define MUL_SAT_OP_EXPAND(a, b, type, size, position) MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate multiply-accumulate
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate. The result is saturated in case of overflow
- */
-#define MLAQ_SAT_IMPL(type, itype) \
- type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
- { \
- itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), (itype)(1 << (fixed_point_position - 1))); \
- return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
- }
-
-MLAQ_SAT_IMPL(qs8x8, qs16x8)
-MLAQ_SAT_IMPL(qs8x16, qs16x16)
-MLAQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mla_sat_##type##x##size((a), (b), (c), (position))
-#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate multiply-accumulate long
- *
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point multiply-accumulate long. The result is saturated in case of overflow
- */
-#define MLALQ_SAT_IMPL(type, itype) \
- itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
- { \
- itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), (itype)(1 << (fixed_point_position - 1))); \
- return add_sat(VopA, res >> (itype)fixed_point_position); \
- }
-
-MLALQ_SAT_IMPL(qs8x8, qs16x8)
-MLALQ_SAT_IMPL(qs16x8, qs32x8)
-
-#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mlal_sat_##type##x##size((a), (b), (c), (position))
-#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
-
-/** Saturate division of two fixed point vectors
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] itype the intermediate data type.
- *
- * @return The result of the fixed point division. The result is saturated in case of overflow
- */
-#define DIVQ_SAT_IMPL(stype, type, itype) \
- inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
- { \
- itype conv_a = CONVERT((VopA), itype); \
- itype denominator = CONVERT((VopB), itype); \
- itype numerator = conv_a << (itype)(fixed_point_position); \
- itype res = select((itype)(numerator / denominator), select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), (itype)(denominator == (itype)0)); \
- return CONVERT_SAT((res), type); \
- }
-
-DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
-DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
-DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
-DIVQ_SAT_IMPL(qs8, qs8, qs16)
-DIVQ_SAT_IMPL(qs16, qs16, qs32)
-
-#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
-#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
-
-#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) div_sat_##type##x##size((a), (b), (position))
-#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
-
-/** Saturate exponential of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the exponential function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point exponential. The result is saturated in case of overflow
- */
-#define EXPQ_IMPL(stype, type, size) \
- inline type exp_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
- type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
- type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
- type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
- type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
- type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
- type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
- type dec_m = m >> (type)fixed_point_position; \
- type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, fixed_point_position); \
- alpha = CONVERT(abs_diff(VopA, alpha), type); \
- type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
- sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
- return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), clz(sum) > dec_m); /* Saturate result if needed */ \
- }
-
-EXPQ_IMPL(qs8, qs8x2, 2)
-EXPQ_IMPL(qs8, qs8x4, 4)
-EXPQ_IMPL(qs8, qs8x8, 8)
-EXPQ_IMPL(qs8, qs8x16, 16)
-EXPQ_IMPL(qs16, qs16x2, 2)
-EXPQ_IMPL(qs16, qs16x4, 4)
-EXPQ_IMPL(qs16, qs16x8, 8)
-EXPQ_IMPL(qs16, qs16x16, 16)
-
-#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
-#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate logarithm of a fixed point vector
- *
- * @note Implemented approach uses taylor polynomial to approximate the logarithm function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point logarithm. The result is saturated in case of overflow
- */
-#define LOGQ_IMPL(stype, type, size) \
- inline type log_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \
- type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
- type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
- type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
- type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
- type inter_a = select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), VopA < const_one); \
- type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
- inter_a = inter_a >> shift_val; \
- inter_a = sub_sat(inter_a, const_one); \
- type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
- sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
- sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
- sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
- sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, size, fixed_point_position); \
- return select(select(sum, -sum, VopA < const_one), (type)0, VopA < (type)0); /* Saturate result if needed */ \
- }
-
-LOGQ_IMPL(qs8, qs8x16, 16)
-LOGQ_IMPL(qs16, qs16x8, 8)
-LOGQ_IMPL(qs16, qs16x16, 16)
-
-#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
-#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate inverse square root of a fixed point vector
- *
- * @note Implemented approach uses Newton's method to approximate the inverse square root function.
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point inverse square root. The result is saturated in case of overflow
- */
-#define INVSQRTQ_IMPL(stype, type, size) \
- inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_three = (type)(3 << (fixed_point_position)); \
- type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
- type temp = select((type)(VopA >> shift_value), select((type)stype##_MAX, (type)(VopA << (-shift_value)), (type)(clz(VopA) > (-shift_value))), (type)(shift_value < (type)0)); \
- type x = temp; \
- x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
- x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
- x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
- if(sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
- { \
- x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
- x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
- } \
- type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
- return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), (type)(clz(x) > shift_value2)), (type)(shift_value < (type)0)); /* Saturate result if needed */ \
- }
-
-INVSQRTQ_IMPL(qs8, qs8x1, 1)
-INVSQRTQ_IMPL(qs16, qs16x1, 1)
-INVSQRTQ_IMPL(qs8, qs8x16, 16)
-INVSQRTQ_IMPL(qs16, qs16x8, 8)
-
-#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
-#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
-
-/** Saturate hyperbolic tangent of a fixed point vector
- *
- * tanh(x) = (e^2x - 1)/(e^2x + 1)
- *
- * @param[in] stype the actual scalar data type.
- * @param[in] type the actual data type.
- * @param[in] size the number of the calculated elements.
- *
- * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of overflow
- */
-#define TANHQ_IMPL(stype, type, size) \
- inline type tanh_sat_##type(type VopA, int fixed_point_position) \
- { \
- type const_one = (type)(1 << (fixed_point_position)); \
- type const_two = (type)(2 << (fixed_point_position)); \
- type exp2x = EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), stype, size, fixed_point_position); \
- type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
- type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
- return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
- }
-
-TANHQ_IMPL(qs8, qs8x16, 16)
-TANHQ_IMPL(qs16, qs16x8, 8)
-
-#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
-#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
-
-#define floatx16 float16
-#define float16_TYPE float16
-
-#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
- { \
- return CONVERT(a * (1 << fixed_point_position) + select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), out_type); \
- }
-
-CONVERTQ_DOWN_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_IMPL(float16, qs16x16)
-
-#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
- { \
- return CONVERT_SAT(a * (1 << fixed_point_position) + select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), out_type); \
- }
-
-CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
-CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
-
-#define CONVERTQ_UP_IMPL(in_type, out_type) \
- inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
- { \
- return CONVERT(a, out_type) / (1 << fixed_point_position); \
- }
-
-CONVERTQ_UP_IMPL(qs8x16, float16)
-CONVERTQ_UP_IMPL(qs16x16, float16)
-
-#define SQCVT_SAT_IMPL(type) \
- inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
- { \
- return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
- }
-
-SQCVT_SAT_IMPL(qs8)
-SQCVT_SAT_IMPL(qs16)
-
-#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
-#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
-
-#endif // ARM_COMPUTE_FIXED_POINT_H
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index e969e847d7..f75161ca0a 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -23,10 +23,6 @@
*/
#include "helpers.h"
-#ifdef FIXED_POINT_POSITION
-#include "fixed_point.h"
-#endif // FIXED_POINT_POSITION
-
#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
#if ELEMENT_SIZE == 1
@@ -44,7 +40,7 @@
* @note The transposition width must be passed at compile time using -DTRANSPOSE_W (i.e. -DTRANSPOSE_W)
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -93,7 +89,7 @@ __kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
* @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -1085,248 +1081,6 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#if defined(FIXED_POINT_POSITION)
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
- *
- * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
- * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
- * @note:ALPHA must be passed in 8 bit fixed point format
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
- uint dst_stride_z)
-{
- int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
- int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
- const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);
- __global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global char *src_end_addr_b = src_addr_b + COLS_B;
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- short8 c00 = 0.0f;
- short8 c10 = 0.0f;
- short8 c20 = 0.0f;
- short8 c30 = 0.0f;
- short8 c01 = 0.0f;
- short8 c11 = 0.0f;
- short8 c21 = 0.0f;
- short8 c31 = 0.0f;
-
- // This for loop performs 1 accumulation for each iteration
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- char4 a0 = vload4(0, src_addr_a);
- char16 b0 = vload16(0, src_addr_b);
-
- c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
- c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
- c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);
- c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);
-
- c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
- c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);
- c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);
- c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Multiply by the weight of matrix product
- char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
- char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
- char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
- char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
-
-#if defined(ALPHA)
- c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
- c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
- c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
- c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
- // Store 16x4 block
- vstore16(c00_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
- vstore16(c10_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
- vstore16(c20_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
- vstore16(c30_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
-}
-
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
- *
- * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
- * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
- * @note:ALPHA must be passed in 16 bit fixed point format
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
- uint dst_stride_z)
-{
- int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
- int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- int z = get_global_id(2);
-
- // Offset
- const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
- const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
-
- // src_addr_a = address of matrix A
- // src_addr_b = address of matrix B
- int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src1_addr_in_bytes += z * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- __global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);
- __global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);
-
- // Compute end row address for matrix B
- __global short *src_end_addr_b = src_addr_b + COLS_B;
-
- src_addr_a += offset_row_a;
- src_addr_b += offset_row_b;
-
- // Reset accumulators
- int8 c00 = 0.0f;
- int8 c10 = 0.0f;
- int8 c20 = 0.0f;
- int8 c30 = 0.0f;
-
- // This for loop performs 1 accumulation for each iteration
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
- {
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- short4 a0 = vload4(0, src_addr_a);
- short8 b0 = vload8(0, src_addr_b);
-
- c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
- c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
- c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);
- c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Multiply by the weight of matrix product
- short8 c00_qs16 = convert_short8_sat(c00);
- short8 c10_qs16 = convert_short8_sat(c10);
- short8 c20_qs16 = convert_short8_sat(c20);
- short8 c30_qs16 = convert_short8_sat(c30);
-
-#if defined(ALPHA)
- c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
- c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
- c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
- c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- // Add offset for batched GEMM
- dst_addr += z * dst_stride_z;
-
- // Store 8x4 block
- vstore8(c00_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));
- vstore8(c10_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));
- vstore8(c20_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));
- vstore8(c30_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));
-}
-#endif // defined(FIXED_POINT_POSITION)
#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
@@ -2543,365 +2297,6 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#if defined(FIXED_POINT_POSITION)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
- *
- * @note This OpenCL kernel works with fixed point data types QS8
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
- * @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
- * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
- * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
- uint dst_stride_z)
-{
- int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(char);
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
-
- short8 acc00 = 0;
- short8 acc01 = 0;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- short8 acc10 = 0;
- short8 acc11 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- short8 acc20 = 0;
- short8 acc21 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- short8 acc30 = 0;
- short8 acc31 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- // This for loop performs 4 accumulations per iteration
- for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
- {
- char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
- char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
-
- acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
- acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);
- acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
- acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b0.s01234567, FIXED_POINT_POSITION);
- acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b1.s01234567, FIXED_POINT_POSITION);
- acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
- acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b0.s01234567, FIXED_POINT_POSITION);
- acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b1.s01234567, FIXED_POINT_POSITION);
- acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
- acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b0.s01234567, FIXED_POINT_POSITION);
- acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b1.s01234567, FIXED_POINT_POSITION);
- acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
- acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- // Left-over accumulations
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
- {
- char a0 = *((__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- char a1 = *((__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- char a2 = *((__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- char a3 = *((__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));
-
- acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);
- acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b0.s01234567, FIXED_POINT_POSITION);
- acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b0.s89ABCDEF, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b0.s01234567, FIXED_POINT_POSITION);
- acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b0.s89ABCDEF, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b0.s01234567, FIXED_POINT_POSITION);
- acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b0.s89ABCDEF, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
- // Multiply by the weight of matrix product and store the result
- char16 acc_qs8;
- acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
-#if defined(ALPHA)
- acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
-#if defined(ALPHA)
- acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
-#if defined(ALPHA)
- acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
-#if defined(ALPHA)
- acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore16(acc_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-}
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
- *
- * @note This OpenCL kernel works with fixed point data types QS16
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
- * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
- * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
- * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
- * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
- uint dst_stride_z)
-{
- int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
-
- // Update address for the matrix B
- src_addr.s1 += idx * sizeof(short);
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
-
- int8 acc0 = 0;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- int8 acc1 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- int8 acc2 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- int8 acc3 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- // This for loop performs 4 accumulations per iteration
- for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
- {
- short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
- short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
-
- acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);
- acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);
- acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);
- acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);
- acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- // Left-over accumulations
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))
- {
- short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));
-
- acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
-
- // Add offset for batched GEMM
- dst_addr += get_global_id(2) * dst_stride_z;
-
- // Multiply by the weight of matrix product and store the result
- short8 acc_qs16;
- acc_qs16 = convert_short8_sat(acc0);
-#if defined(ALPHA)
- acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc_qs16 = convert_short8_sat(acc1);
-#if defined(ALPHA)
- acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc_qs16 = convert_short8_sat(acc2);
-#if defined(ALPHA)
- acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc_qs16 = convert_short8_sat(acc3);
-#if defined(ALPHA)
- acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
-#endif // defined(ALPHA)
- vstore8(acc_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-}
-#endif // defined(FIXED_POINT_POSITION)
#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(BETA)
@@ -2988,94 +2383,6 @@ __kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
vstore8(out, 0, (__global half *)dst.ptr);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-
-#if defined(FIXED_POINT_POSITION)
-/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
- *
- * @note: BETA must be passed in 8 bit fixed point format
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_ma_qs8(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- // Load values from A x B
- char16 alpha_ab = vload16(0, (__global char *)dst.ptr);
-
- // Load values from Matrix C
- char16 c = vload16(0, (__global char *)src.ptr);
-
- // Computes alpha * axb + beta * c
- char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);
-
- // Store final result in axb matrix
- vstore16(out, 0, (__global char *)dst.ptr);
-}
-
-/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
- *
- * @note: BETA must be passed in 16 bit fixed point format
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_ma_qs16(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- // Load values from A x B
- short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
-
- // Load values from Matrix C
- short8 c = vload8(0, (__global short *)src.ptr);
-
- // Computes alpha * axb + beta * c
- short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
-
- // Store final result in axb matrix
- vstore8(out, 0, (__global short *)dst.ptr);
-}
-#endif // defined(FIXED_POINT_POSITION)
#endif // defined(BETA)
#if defined(WIDTH_VECTOR_A)
@@ -3151,7 +2458,7 @@ __kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
*
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32
* @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
@@ -3175,11 +2482,7 @@ __kernel void gemm_accumulate_biases(
accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
-#ifdef FIXED_POINT_POSITION
- accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, VECTOR_SIZE);
-#else // FIXED_POINT_POSITION
- accum_value = biases_value + accum_value;
-#endif // FIXED_POINT_POSITION
+ accum_value = biases_value + accum_value;
// Store result in the accumulate buffer
VSTORE(VECTOR_SIZE)
(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 6f25ad4b7a..d034b30b68 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -23,12 +23,7 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-#include "fixed_point.h"
-#endif // FIXED_POINT_POSITION
-
#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
-#if !defined(FIXED_POINT_POSITION)
#if ELEMENT_SIZE == 1
#define COND_DATA_TYPE char
@@ -50,7 +45,7 @@
* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -139,7 +134,7 @@ __kernel void im2col1x1_stridex1_dchw(
* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -232,7 +227,7 @@ __kernel void im2col_generic_nhwc(
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -338,7 +333,7 @@ __kernel void im2col3x3_nhwc(
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -425,7 +420,7 @@ __kernel void im2col3x3_dchw(
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -598,7 +593,7 @@ __kernel void im2col5x5_dchw(
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -788,7 +783,6 @@ __kernel void im2col11x11_padx0_pady0_dchw(
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
-#endif // !defined(FIXED_POINT_POSITION)
#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
@@ -799,7 +793,7 @@ __kernel void im2col11x11_padx0_pady0_dchw(
* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -863,11 +857,7 @@ __kernel void im2col_generic_padx0_pady0_dchw(
#ifdef HAS_BIAS
if(ch == (KERNEL_DEPTH - 1))
{
-#ifdef FIXED_POINT_POSITION
- *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else // FIXED_POINT_POSITION
*output_ptr = 1.0f;
-#endif // FIXED_POINT_POSITION
}
#endif // HAS_BIAS
}
@@ -886,7 +876,7 @@ __kernel void im2col_generic_padx0_pady0_dchw(
* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -950,11 +940,7 @@ __kernel void im2col_generic_dchw(
#ifdef HAS_BIAS
if(ch == (KERNEL_DEPTH - 1))
{
-#ifdef FIXED_POINT_POSITION
- *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else // FIXED_POINT_POSITION
*output_ptr = 1.0f;
-#endif // FIXED_POINT_POSITION
}
#endif // HAS_BIAS
}
@@ -966,7 +952,7 @@ __kernel void im2col_generic_dchw(
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
* @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -999,11 +985,7 @@ __kernel void im2col_reduced_dchw(
if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
{
tmp_out_ptr += dst_stride_x;
-#ifdef FIXED_POINT_POSITION
- *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else // FIXED_POINT_POSITION
*((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1.0f;
-#endif // FIXED_POINT_POSITION
}
#endif // HAS_BIAS
}
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index 8d47631019..f58e98bace 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,11 +28,11 @@
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index bc00252fbd..dbdad27865 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,22 +23,6 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-
-#include "fixed_point.h"
-#define MUL_OP(x, y) MUL_SAT_OP_EXPAND((x), (y), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define ADD_OP(x, y) ADD_SAT_OP_EXPAND((x), (y), DATA_TYPE, VEC_SIZE)
-#define DIV_OP(x, y) DIV_SAT_OP_VEC_EXPAND((x), (y), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define EXP_OP(x) EXP_OP_EXPAND((x), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define LOG_OP(x) LOG_OP_EXPAND((x), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
-#define POW_OP(x, y) EXP_OP(MUL_OP(LOG_OP((x)), (y)))
-#define SQCVT_SAT(a) SQCVT_SAT_OP_EXPAND((a), DATA_TYPE, FIXED_POINT_POSITION)
-
-#define LOAD_OP(offset, ptr) vload16(offset, ptr)
-#define STORE_OP(data, offset, ptr) vstore16(data, offset, ptr)
-
-#else // FIXED_POINT_POSITION
-
#define MUL_OP(x, y) ((x) * (y))
#define ADD_OP(x, y) ((x) + (y))
#define DIV_OP(x, y) ((x) / (y))
@@ -48,18 +32,15 @@
#define LOAD_OP(offset, ptr) vload4(offset, ptr)
#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
-#endif // FIXED_POINT_POSITION
-
/** Apply cross-map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
* @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
* @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
* @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
@@ -116,10 +97,9 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
* @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
* @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/F16/F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
index 6f978c9b70..03fc15e4e8 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -29,7 +29,7 @@
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -63,7 +63,7 @@ __kernel void permute_201(
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -97,7 +97,7 @@ __kernel void permute_120(
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index b5734a39ed..c99a08a583 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,18 +23,6 @@
*/
#include "helpers.h"
-#if defined(FIXED_POINT_POSITION)
-
-#include "fixed_point.h"
-
-#if defined(SATURATE)
-#define MUL_OP(x, y, scale, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#else // SATURATE
-#define MUL_OP(x, y, scale, type, size) MUL_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#endif // SATURATE
-
-#else // FIXED_POINT_POSITION
-
#if defined(SATURATE)
#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
#else // SATURATE
@@ -44,17 +32,14 @@
#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
-#endif // FIXED_POINT_POSITION
-
/** Performs a pixelwise multiplication with integer scale of integer inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
* @attention The data_type of the intermediate result of the multiplication should passed as well using -DDATA_TYPE_RES.
* e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
- * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
@@ -78,7 +63,7 @@
* @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
+ * @param[in] scale Integer scaling factor. Supported data types: S32.
*/
__kernel void pixelwise_mul_int(
TENSOR3D_DECLARATION(in1),
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 2c7ddfdf23..c38a78ce3e 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -23,28 +23,6 @@
*/
#include "helpers.h"
-#ifdef FIXED_POINT_POSITION
-
-#include "fixed_point.h"
-
-#if defined(POOL_AVG)
-#define POOL_OP(x, y) add_sat(x, y)
-#else /* POOL_AVG */
-#define POOL_OP(x, y) (max((x), (y)))
-#endif /* POOL_AVG */
-
-#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION)
-#define DIV_OP(x, y) DIV_OP1(x, y << FIXED_POINT_POSITION)
-#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION)))
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION)
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#else /* FIXED_POINT_POSITION */
-
#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(x, y) ((x) + (y))
#else /* defined(POOL_AVG) || defined(POOL_L2) */
@@ -60,8 +38,6 @@
#define DIV_OP(x, y) (x * (1.f / y))
#define SQRT_OP(x) sqrt((x))
-#endif /* FIXED_POINT_POSITION */
-
#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(DATA_TYPE, 8))(1.f / y))
#if STRIDE_X == 1
@@ -201,14 +177,14 @@ DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, cons
/** Performs a pooling function of pool size equal to 2.
*
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -265,14 +241,14 @@ __kernel void pooling_layer_2(
/** Performs a pooling function of pool size equal to 3
*
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -331,7 +307,7 @@ __kernel void pooling_layer_3(
*(__global DATA_TYPE *)output.ptr = res;
}
-#if defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
+#if defined(POOLING3x3)
#define CONVERT_OP(data_type) convert_##data_type##4
#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
@@ -353,7 +329,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
*
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
* -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
@@ -403,7 +379,7 @@ __kernel void pooling_layer_optimized_3(
vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
}
-#endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
+#endif // defined(POOLING3x3)
#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
@@ -411,23 +387,17 @@ __kernel void pooling_layer_optimized_3(
#if defined(POOL_AVG) || defined(POOL_L2)
#define INITIAL_VALUE 0
#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#ifdef FIXED_POINT_POSITION
-#define MIN_VAL_EXPAND(type) type##_MIN
-#define MIN_VAL(type) MIN_VAL_EXPAND(type)
-#define INITIAL_VALUE MIN_VAL(DATA_TYPE)
-#else // FIXED_POINT_POSITION
#if FP16
#define INITIAL_VALUE -HALF_MAX
#else // FP16
#define INITIAL_VALUE -FLT_MAX
#endif // FP16
-#endif // FIXED_POINT_POSITION
#endif // POOL_AVG
/** Performs a pooling function of pool size equal to N (NCHW)
*
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
* @note -DFP16 must be passed at compile time if half float data type is used
* @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
* @note In case of average pooling the following information must be passed at compile time:
@@ -436,7 +406,7 @@ __kernel void pooling_layer_optimized_3(
* -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/reshape_layer.cl
index 23eccbf817..11393d246d 100644
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ b/src/core/CL/cl_kernels/reshape_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index aa1fa01c53..e549b44245 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -23,23 +23,6 @@
*/
#include "helpers.h"
-#ifdef FIXED_POINT_POSITION
-
-#include "fixed_point.h"
-#define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
-#define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
-#define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
-#define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
-#define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
-
-#define MIN_VAL_EXPAND(type) type##_MIN
-#define MIN_VAL(type) MIN_VAL_EXPAND(type)
-#define MINVAL MIN_VAL(DATA_TYPE)
-#define SELECT_DATA_TYPE EXPAND(DATA_TYPE)
-
-#else /* FIXED_POINT_POSITION */
-
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
@@ -55,8 +38,6 @@
#define SELECT_DATA_TYPE int
#endif /* USE_F16 */
-#endif /* FIXED_POINT_POSITION */
-
/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
@@ -91,9 +72,8 @@ __constant uint4 idx4 = (uint4)(0, 1, 2, 3);
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -138,11 +118,10 @@ __kernel void softmax_layer_norm(
* then gets the exponent of each element as sums all elements across each row.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -288,11 +267,10 @@ __kernel void softmax_layer_max_shift_exp_sum_serial(
* then gets the exponent of each element as sums all elements across each row.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
* @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index c055381fc5..95d6d4bcc5 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -230,10 +230,9 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
* then gets the exponent of each element as sums all elements across each row.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
* @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -519,7 +518,6 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
* @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
* @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
*
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 3d8824aa2a..1ae1032cba 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
@@ -47,7 +46,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
&& (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
&& (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
@@ -58,7 +57,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -118,7 +116,6 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
float a_const = act_info.a();
float b_const = act_info.b();
int a_const_int = 0;
@@ -127,16 +124,8 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
// Create quantized version of constants a, b if needed
if(is_data_type_quantized(dt))
{
- if(is_data_type_fixed_point(dt))
- {
- a_const_int = static_cast<int>(lround(a_const * (1 << fixed_point_position)));
- b_const_int = static_cast<int>(lround(b_const * (1 << fixed_point_position)));
- }
- else
- {
- a_const_int = input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
- b_const_int = input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
- }
+ a_const_int = input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+ b_const_int = input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
}
// Set build options
@@ -177,10 +166,6 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
}
build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
- if(is_data_type_fixed_point(dt))
- {
- build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
- }
// Create kernel
std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 011807ad88..78651f8679 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -37,9 +37,9 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
if(is_qasymm)
@@ -50,18 +50,16 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2);
// Validate in case of configured output
if(output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
"Output can only be U8 if both inputs are U8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
"Wrong shape for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output);
if(is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
@@ -142,11 +140,7 @@ void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTen
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- if(is_data_type_fixed_point(input1->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
- }
- else if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
{
build_opts.emplace("-DOFFSET=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
kernel_name += "_quantized";
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index db91bc0084..aeee6022a7 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,21 +44,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
// Validate in case of configured output
if((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
}
return Status{};
@@ -122,10 +120,6 @@ void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICL
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- if(is_data_type_fixed_point(input1->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
- }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 391baef96a..5999c66056 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -46,22 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
if(beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
}
if(gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
}
if(act_info.enabled())
@@ -78,7 +74,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -168,7 +163,6 @@ void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *out
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
build_opts.add_option_if(beta == nullptr, "-DUSE_DEFAULT_BETA");
build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index 1de987264c..5f0f0aebf8 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -39,8 +39,8 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 64e6a0b7d8..6274c9082a 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -44,14 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, s
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
// Checks performed when output is configured
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -64,7 +63,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims)));
- const unsigned int num_elems_read_per_iteration = is_data_type_fixed_point(input->data_type()) ? 1 : 8;
+ const unsigned int num_elems_read_per_iteration = 8;
// Configure window
Window win = calculate_max_window(*input, Steps(num_elems_read_per_iteration));
@@ -106,7 +105,6 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
- build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options()));
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
index c3cd494662..a39d1f4a0b 100644
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
@@ -75,7 +75,7 @@ Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input,
DataLayout data_layout)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::QS32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 204f9aed6f..72dc21197d 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,9 +62,8 @@ void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned i
};
ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index 83908a1469..2f5b2466b1 100644
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,21 +40,15 @@ using namespace arm_compute;
void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16,
+ DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
DataType::U16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
ARM_COMPUTE_ERROR_ON(shift >= 8);
// Check if convertion is supported
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
- "Only data types supported [in] QS8 -> [out] F32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32),
- "Only data types supported [in] QS16 -> [out] F32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16),
- "Only data types supported [in] F32 -> [out] QS8, QS16");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
&& output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
"Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
@@ -99,10 +93,6 @@ void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *out
}
build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
- }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 1de08aa1a2..9d9c280182 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -146,7 +146,6 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input,
output_shape,
1,
input->info()->data_type(),
- input->info()->fixed_point_position(),
input->info()->quantization_info());
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index bef13f9b1c..cab943629a 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -53,7 +53,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index c97ecaf8e0..e124ee42f3 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -61,7 +61,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, s
TensorShape output_shape = compute_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
index fd3b75484a..c28be3fccf 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -46,7 +46,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(idx_w) * input->dimension(idx_h) + ((biases != nullptr) ? 1 : 0)));
@@ -54,7 +53,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
if(biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(idx_c));
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index fa982d6cf2..fba721f50b 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
{
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
constexpr unsigned int num_elems_processed_per_iteration = 4;
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index d2794d7abd..dcb4ac1c5d 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -45,7 +45,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
"Weights should have same width as length");
@@ -84,7 +84,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -103,7 +102,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
auto_init_if_empty(*output, output_shape,
1,
input->data_type(),
- input->fixed_point_position(),
input->quantization_info());
unsigned int conv_stride_x = std::get<0>(conv_info.stride());
@@ -265,7 +263,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
output_shape,
1,
input->info()->data_type(),
- input->info()->fixed_point_position(),
input->info()->quantization_info());
// Perform validation step
@@ -302,18 +299,14 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
}
else
{
- bool is_quantized_fixed_point = is_data_type_fixed_point(data_type);
- bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
- DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
+ bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
- build_options.add_option_if(is_quantized_fixed_point,
- std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
- build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));
+ build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 66504e67b5..3b1edaf46c 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,10 +91,6 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
- if(is_data_type_fixed_point(tensor->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION");
- }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
@@ -125,14 +121,12 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
case DataType::QASYMM8:
set_constant_border<uint8_t>(idx, constant_border_value);
break;
- case DataType::QS8:
case DataType::S8:
set_constant_border<int8_t>(idx, constant_border_value);
break;
case DataType::U16:
set_constant_border<uint16_t>(idx, constant_border_value);
break;
- case DataType::QS16:
case DataType::S16:
set_constant_border<int16_t>(idx, constant_border_value);
break;
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 11f8e33319..f6b0e829a0 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void CLFloorKernel::configure(const ICLTensor *input, ICLTensor *output)
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Auto initialize output
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index ba475f5819..12a40cd7dc 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -44,15 +44,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
{
ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 3f705ac0a7..e040122663 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -172,7 +172,7 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC
tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
- auto_init_if_empty(*output->info(), tensor_shape, 1, DataType::S32, 1, QuantizationInfo());
+ auto_init_if_empty(*output->info(), tensor_shape, 1, DataType::S32, QuantizationInfo());
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 81e455fce8..04cf627818 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -41,9 +41,8 @@ namespace
Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
return Status{};
@@ -95,8 +94,6 @@ void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTe
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option_if(is_data_type_fixed_point(accum->info()->data_type()),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(accum->info()->fixed_point_position()));
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index c50ee24a70..bcc3a01296 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -29,7 +29,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
@@ -64,7 +63,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_UNUSED(input, output, beta);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -88,19 +87,7 @@ void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *ou
_output = output;
std::ostringstream ma_arguments;
- if(is_data_type_fixed_point(input->info()->data_type()))
- {
- ma_arguments << "-DBETA=" << (input->info()->data_type() == DataType::QS8 ?
- sqcvt_qs8_f32(beta, input->info()->fixed_point_position()) :
- sqcvt_qs16_f32(beta, input->info()->fixed_point_position()))
- << " ";
- ma_arguments << "-DFIXED_POINT_POSITION=" << input->info()->fixed_point_position();
- }
- else
- {
- ma_arguments << "-DBETA=" << beta;
- }
-
+ ma_arguments << "-DBETA=" << beta;
std::set<std::string> build_opts;
build_opts.emplace(ma_arguments.str());
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 2c2a92d070..814cbb631f 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -31,7 +31,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
@@ -53,10 +52,8 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_fixed_point(input0->data_type()) && (reshape_info.depth_output_gemm3d() != 1), "GEMM3D only supports floating point data types");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
@@ -95,7 +92,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
}
return Status{};
@@ -219,7 +215,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
const DataType data_type = input0->info()->data_type();
- const int fp_pos = input0->info()->fixed_point_position();
// Get target architecture
GPUTarget gpu_target = get_target();
@@ -236,14 +231,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));
// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
if(std::abs(1.0f - alpha) > 0.00001f)
{
- build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
- "-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
- "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option("-DALPHA=" + float_to_string_with_full_precision(alpha));
}
build_opts.add_option_if(_is_gemm3d, "-DREINTERPRET_OUTPUT_AS_3D");
build_opts.add_option_if(_is_gemm3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
@@ -299,10 +291,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
// via exhaustive autotuning over a range of representative layer configurations.
_lws_hint = cl::NDRange(4);
}
- else if(is_data_type_fixed_point(data_type))
- {
- kernel_name = "gemm_mm_" + lower_string(string_from_data_type(data_type));
- }
else // (MIDGARD and F32) or (F16)
{
kernel_name = "gemm_mm_floating_point";
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index d8ecd501b0..43a6cf25db 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -42,7 +42,6 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 7a8a1e529d..7e44fa7118 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -47,8 +47,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
{
ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
if(output->total_size() != 0)
@@ -56,7 +56,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 5d4e039e94..b54575ae30 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -48,7 +48,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
@@ -58,7 +58,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, b
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -136,7 +135,7 @@ CLIm2ColKernel::configure_window(const ICLTensor *input, ICLTensor *output, cons
if(dilation == Size2D(1U, 1U))
{
- if(squared_im2col && !is_data_type_fixed_point(data_type))
+ if(squared_im2col)
{
// Check if we can run an optimized im2col
switch(kernel_dims.width)
@@ -304,7 +303,6 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
build_opts.add_option_if(has_bias, "-DHAS_BIAS");
- build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
_num_elems_processed_per_iteration = 1;
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 3d30350c59..39d9f958d3 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -78,7 +77,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index 60dd5e7de3..9493ddc878 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -62,7 +62,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
TensorShape output_shape = compute_min_max_shape(input);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+ auto_init_if_empty(*output, output_shape, 1, input->data_type());
const unsigned int num_elems_processed_per_iteration = 1;
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 5456876ee8..df01eab240 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -40,24 +39,16 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
- if(is_data_type_fixed_point(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
- }
-
// Checks performed when output is configured
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -74,7 +65,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
const BorderSize border_size = BorderSize(0, border_width);
- const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->data_type())) ? 16 : 4;
+ const unsigned int num_elems_processed_per_iteration = 4;
const unsigned int num_elems_read_per_iteration = is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) : num_elems_processed_per_iteration;
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -119,14 +110,12 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
_border_size = BorderSize(0, border_width);
- const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->info()->data_type())) ? 16 : 4;
+ const unsigned int num_elems_processed_per_iteration = 4;
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
// Set build options
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
build_opts.add_option(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index 168ab81088..7c0c95be1c 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -52,8 +52,8 @@ TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
@@ -68,7 +68,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index a9df36dfcc..4ea093fe04 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -51,36 +51,23 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
-
- if(is_data_type_fixed_point(input1->data_type()))
- {
- // All data types must be all QS8 or all QS16
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
- }
// Validate in case of configured output
if(output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
- if(is_data_type_fixed_point(input1->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- }
}
return Status{};
@@ -174,14 +161,6 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
{
compute_type = "int";
}
- else if(input1->info()->data_type() == DataType::QS8)
- {
- compute_type = "qs8";
- }
- else if(input1->info()->data_type() == DataType::QS16)
- {
- compute_type = "qs16";
- }
else
{
compute_type = "ushort";
@@ -197,10 +176,6 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I
std::set<std::string> build_opts;
build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
- if(is_data_type_fixed_point(input1->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
- }
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 81c52ed53b..246ab68130 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -62,7 +62,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
switch(data_layout)
{
case DataLayout::NCHW:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
break;
case DataLayout::NHWC:
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -78,8 +78,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type(), output->fixed_point_position()));
+ TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
@@ -214,8 +213,6 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
- build_opts.add_option_if(is_data_type_fixed_point(data_type),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
@@ -240,7 +237,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
{
// Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
// each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+ const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3);
std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+ support::cpp11::to_string(pool_size_x);
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 028e50821f..af751f4832 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
{
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
constexpr unsigned int num_elems_processed_per_iteration = 4;
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 51873ff66a..4048e927f5 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -56,7 +56,7 @@ void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArra
// Output auto inizialitation if not yet initialized
TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index c44fced3e3..d64f0d89c5 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -65,7 +64,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
// Output tensor auto initialization if not yet initialized
TensorShape output_shape{ input->tensor_shape() };
output_shape.set(axis, 1);
- auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+ auto_init_if_empty(*output, output_shape, 1, input->data_type());
const unsigned int num_elems_processed_per_iteration = 16;
@@ -118,10 +117,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- if(is_data_type_fixed_point(input->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
- }
switch(op)
{
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index 15897c9dd7..ce9d7fff67 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,12 +47,11 @@ CLReshapeLayerKernel::CLReshapeLayerKernel()
void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 6a18e5ffce..b9ebdc9583 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -82,11 +82,10 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float
Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
@@ -102,7 +101,6 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
}
// Checks performed when sum is configured
@@ -117,7 +115,6 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
}
return Status{};
@@ -126,10 +123,9 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITens
Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);
// Note: output should always have a scale of 1/256 and offset 0
const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
@@ -139,7 +135,6 @@ Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *su
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
if(!is_quantized_asymmetric)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -239,15 +234,11 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
const DataType dt = input->info()->data_type();
const size_t reduction_dim_size = input->info()->dimension(0);
- auto beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option_if(is_data_type_fixed_point(dt),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
- build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
@@ -364,8 +355,6 @@ void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *su
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
build_opts.add_options_if(is_quantized_asymmetric,
prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 8260606a7d..3d584345d7 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -57,8 +57,8 @@ TensorShape transposed_tensor_shape(const TensorShape &in)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
@@ -68,7 +68,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index b012d58d59..5243c4099e 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -42,13 +42,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
if(biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
@@ -60,7 +59,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -96,7 +94,6 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
- build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
// Create kernel
std::string kernel_name = std::string("reshape_to_columns_") + lower_string(string_from_data_layout(data_layout));
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index 56d6ec8f16..587ba690c2 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -60,10 +60,9 @@ Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, c
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::F16, DataType::U32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0));
for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)