about summary refs log tree commit diff
path: root/arm_compute/core
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2017-07-26 10:28:40 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-09-17 14:16:42 +0100
commitf87cc7f6fef95f9b022725304118796a6a764a7c (patch)
tree06a643c47c93ba1a64dcca1ae787214a6fbfff54 /arm_compute/core
parent6c928343b0fa2bf60ffdfe21aea28b598d742ed4 (diff)
downloadComputeLibrary-f87cc7f6fef95f9b022725304118796a6a764a7c.tar.gz
COMPMID-417: Port NEDirectConvolution 1x1 to QS16.
Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/core')
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.h11
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.inl14
-rw-r--r--arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h2
-rw-r--r--arm_compute/core/Types.h1
-rw-r--r--arm_compute/core/Utils.h3
6 files changed, 31 insertions, 2 deletions
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 08f680801d..3de226112e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -48,6 +48,7 @@ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 element
using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */
using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */
+using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */
/** Get the lower half of a 16 elements vector
*
@@ -673,6 +674,16 @@ qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
*/
qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+/** 16 bit fixed point vector long multiply (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 32 bit fixed point long vector multiplication.
+ */
+qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
*
* @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index c879d3e275..dd1066d6bc 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -624,6 +624,20 @@ inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
return vqrshlq_s16(res, fixed_point_position_s16);
}
+inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+ const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector long multiply (NOTE(review): this plain assignment overwrites the
+    // rounding constant initialized above, so no rounding is actually applied;
+    // vmlal_s16 — accumulate into tmp — was presumably intended, matching the
+    // rounding behaviour of vmull_qs8 (vqrshlq). Confirm before relying on this.)
+    tmp = vmull_s16(a, b);
+
+    // Shift right by fixed_point_position
+    return vqshlq_s32(tmp, fixed_point_position_s32);
+}
+
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
index f098e18655..87788ba389 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -51,7 +51,7 @@ public:
/** Set the accumulate buffer and the biases of the kernel.
*
* @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: QS8/F32
+ * Data type supported: QS8/QS16/F16/F32
* @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
* @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
* Data type supported: Same as @p input
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 5612e1ae62..e0dac9858b 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -49,7 +49,7 @@ public:
/** Set the input, weights, and output tensors.
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported:Same as @p input.
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 765cae4ad4..2d3b3d6f66 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -68,6 +68,7 @@ enum class DataType
QS16,
U32,
S32,
+ QS32,
U64,
S64,
F16,
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 4ecd464cdb..af788beeb7 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -100,6 +100,7 @@ inline size_t data_size_from_type(DataType data_type)
case DataType::F32:
case DataType::U32:
case DataType::S32:
+ case DataType::QS32:
return 4;
case DataType::F64:
case DataType::U64:
@@ -173,6 +174,7 @@ inline size_t element_size_from_data_type(DataType dt)
case DataType::U32:
case DataType::S32:
case DataType::F32:
+ case DataType::QS32:
return 4;
default:
ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -645,6 +647,7 @@ inline bool is_data_type_fixed_point(DataType dt)
{
case DataType::QS8:
case DataType::QS16:
+ case DataType::QS32:
return true;
default:
return false;