-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.h                                           11
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.inl                                         14
-rw-r--r--  arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h    2
-rw-r--r--  arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h                  2
-rw-r--r--  arm_compute/core/Types.h                                                         1
-rw-r--r--  arm_compute/core/Utils.h                                                         3
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h                    2
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp          84
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp                       128
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp                         41
-rw-r--r--  tests/validation/NEON/DirectConvolutionLayer.cpp                                62
11 files changed, 263 insertions, 87 deletions
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 08f680801d..3de226112e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -48,6 +48,7 @@ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 element
using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */
using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */
+using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */
/** Get the lower half of a 16 elements vector
*
@@ -673,6 +674,16 @@ qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
*/
qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+/** 16 bit fixed point vector long multiply (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 32 bit fixed point long vector multiplication.
+ */
+qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
*
* @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index c879d3e275..dd1066d6bc 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -624,6 +624,20 @@ inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
return vqrshlq_s16(res, fixed_point_position_s16);
}
+inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+ const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ tmp = vmlal_s16(tmp, a, b);
+
+ // Shift right by fixed_point_position
+ return vqshlq_s32(tmp, fixed_point_position_s32);
+}
+
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
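For reference, a minimal scalar sketch (not part of the patch; the helper name is illustrative) of the per-lane arithmetic vmull_qs16 is meant to perform, assuming fixed_point_position fractional bits:

    #include <cstdint>

    // Scalar model of one lane of vmull_qs16: widen to 32 bits, multiply, add the
    // rounding constant at the fractional boundary, then shift right by
    // fixed_point_position so the product stays in Q format.
    int32_t scalar_mull_qs16(int16_t a, int16_t b, int fixed_point_position)
    {
        const int64_t round = 1LL << (fixed_point_position - 1);
        const int64_t res   = (static_cast<int64_t>(a) * static_cast<int64_t>(b) + round) >> fixed_point_position;
        // Clamp to the 32-bit range, mirroring the saturating vqshlq_s32 shift.
        if(res > INT32_MAX) { return INT32_MAX; }
        if(res < INT32_MIN) { return INT32_MIN; }
        return static_cast<int32_t>(res);
    }

For example, with fixed_point_position = 8 (Q8.8), 1.5 is stored as 384 and 2.0 as 512; the sketch returns (384 * 512 + 128) >> 8 = 768, i.e. 3.0 in Q8.8.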
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
index f098e18655..87788ba389 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -51,7 +51,7 @@ public:
/** Set the accumulate buffer and the biases of the kernel.
*
* @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: QS8/F32
+ * Data type supported: QS8/QS16/F16/F32
* @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
* @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
* Data type supported: Same as @p input
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 5612e1ae62..e0dac9858b 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -49,7 +49,7 @@ public:
/** Set the input, weights, and output tensors.
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported:Same as @p input.
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 765cae4ad4..2d3b3d6f66 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -68,6 +68,7 @@ enum class DataType
QS16,
U32,
S32,
+ QS32,
U64,
S64,
F16,
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 4ecd464cdb..af788beeb7 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -100,6 +100,7 @@ inline size_t data_size_from_type(DataType data_type)
case DataType::F32:
case DataType::U32:
case DataType::S32:
+ case DataType::QS32:
return 4;
case DataType::F64:
case DataType::U64:
@@ -173,6 +174,7 @@ inline size_t element_size_from_data_type(DataType dt)
case DataType::U32:
case DataType::S32:
case DataType::F32:
+ case DataType::QS32:
return 4;
default:
ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -645,6 +647,7 @@ inline bool is_data_type_fixed_point(DataType dt)
{
case DataType::QS8:
case DataType::QS16:
+ case DataType::QS32:
return true;
default:
return false;
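To make the new QS32 entries concrete, here is a hypothetical pair of conversion helpers (illustrative only, not library API) showing how a 32-bit fixed point value relates to the fixed_point_position used throughout these kernels:

    #include <cmath>
    #include <cstdint>

    // A real value x is stored as round(x * 2^fixed_point_position) in a signed
    // 32-bit word, so fixed_point_position is the number of fractional bits.
    int32_t float_to_qs32(float x, int fixed_point_position)
    {
        const double scale = static_cast<double>(1LL << fixed_point_position);
        return static_cast<int32_t>(std::lround(x * scale));
    }

    float qs32_to_float(int32_t q, int fixed_point_position)
    {
        const double scale = static_cast<double>(1LL << fixed_point_position);
        return static_cast<float>(q / scale);
    }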
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index a66cab3013..872fae3a6b 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -48,7 +48,7 @@ public:
NEDirectConvolutionLayer();
/** Set the input, weights, biases and output tensors.
*
- * @param[in, out] input Input tensor. Data types supported: QS8/F16/F32.
+ * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32.
* @param[in] weights Set of kernels to convolve the input volume.
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index fb16c8dcc1..12ef064803 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -54,6 +54,11 @@ inline qint16x8_t internal_vld1q(const qint16_t *in)
return vld1q_qs16(in);
}
+inline qint32x4_t internal_vld1q(const qint32_t *in)
+{
+ return vld1q_s32(in);
+}
+
// Internal store
inline void internal_vst1q(float *p, const float32x4_t &v)
{
@@ -72,6 +77,16 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
vst1q_qs16(p, v);
}
+inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
+{
+ vst1q_s32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
+{
+ vst1_qs16(p, vqmovn_qs32(v));
+}
+
// Internal vdup
inline float32x4_t internal_vdupq_n(float v)
{
@@ -86,6 +101,11 @@ inline qint16x8_t internal_vdupq_n(qint16_t v)
return vdupq_n_qs16(v);
}
+inline qint32x4_t internal_vdupq_n(qint32_t v)
+{
+ return vdupq_n_qs32(v);
+}
+
// Internal vadd
inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
{
@@ -99,6 +119,10 @@ inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
{
return vqaddq_qs16(x, y);
}
+inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
+{
+ return vqaddq_qs32(x, y);
+}
#ifdef ARM_COMPUTE_ENABLE_FP16
inline float16x8_t internal_vld1q(const float16_t *in)
@@ -162,8 +186,8 @@ NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumu
void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
if(output != nullptr)
{
@@ -198,27 +222,47 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con
INEKernel::configure(win);
// Set appropriate function
- if(input->info()->data_type() == DataType::F32)
+ switch(input->info()->data_type())
{
- _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
- }
+ case DataType::QS8:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ break;
+ }
+ case DataType::QS16:
+ {
+ if(bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ case DataType::QS32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
+ break;
+ }
#ifdef ARM_COMPUTE_ENABLE_FP16
- else if(input->info()->data_type() == DataType::F16)
- {
- _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
- }
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
+ break;
+ }
#endif /* ARM_COMPUTE_ENABLE_FP16 */
- else if(input->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
- }
- else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ break;
+ }
}
}
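A rough scalar model (hypothetical helper, not the kernel's accumulate_bias template) of what the new QS32 specialisation does per element: the QS16 bias shares the accumulator's fixed point position, so its raw value can be added directly, and the out-of-place store narrows back to QS16 with saturation (cf. internal_vst1q(qint16_t *, const qint32x4_t &) above):

    #include <algorithm>
    #include <cstdint>

    // Widen the QS16 bias, add it to the QS32 accumulator, then saturate the
    // result into the QS16 range for the final output.
    int16_t accumulate_bias_qs32(int32_t acc, int16_t bias)
    {
        const int64_t sum = static_cast<int64_t>(acc) + bias;
        return static_cast<int16_t>(std::min<int64_t>(std::max<int64_t>(sum, INT16_MIN), INT16_MAX));
    }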
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 09d8dd5642..43292d1b22 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -39,6 +39,34 @@ using namespace arm_compute;
namespace
{
+template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+template <>
+qint16x8_t internal_vld1q<2>(const qint16_t *in)
+{
+ const int16x8x2_t tmp = vld2q_s16(in);
+ return tmp.val[0];
+}
+
+template <>
+qint16x8_t internal_vld1q<3>(const qint16_t *in)
+{
+ const int16x8x3_t tmp = vld3q_s16(in);
+ return tmp.val[0];
+}
+
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
#ifdef ARM_COMPUTE_ENABLE_FP16
template <unsigned int stridex>
float16x8_t internal_vld1q(const float16_t *in);
@@ -109,6 +137,28 @@ float32x4_t internal_vld1q<3>(const float *in)
return tmp.val[0];
}
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
template <unsigned int stridex>
qint8x8_t internal_vld1q(const qint8_t *in);
@@ -132,28 +182,19 @@ qint8x8_t internal_vld1q<3>(const qint8_t *in)
return tmp.val[0];
}
-template <unsigned int stridex>
-qint16x8_t internal_vld1q(const qint16_t *in);
-
-template <>
-qint16x8_t internal_vld1q<1>(const qint16_t *in)
-{
- return vld1q_s16(in);
-}
-
-inline float32x4_t internal_vdupq_n(float v)
+inline qint8x8_t internal_vdupq_n(qint8_t v)
{
- return vdupq_n_f32(v);
+ return vdup_n_qs8(v);
}
-inline qint8x8_t internal_vdupq_n(qint8_t v)
+inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
{
- return vdup_n_qs8(v);
+ return vmull_qs8(x, y, fixed_point_position);
}
-inline void internal_vst1q(float *p, const float32x4_t &v)
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
{
- vst1q_f32(p, v);
+ return vqmlal_qs8(x, y, z, fixed_point_position);
}
inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
@@ -161,26 +202,50 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
vst1q_qs16(p, v);
}
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+inline void internal_vst1q(int *p, const qint32x4x2_t &v)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmulq_f32(x, y);
+ vst1q_s32(p, v.val[0]);
+ vst1q_s32(p + 4, v.val[1]);
}
-qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+template <unsigned int stridex>
+qint32x4x2_t internal_vld1q(const qint32_t *in);
+
+template <>
+qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
{
- return vmull_qs8(x, y, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vld1q_s32(in),
+ vld1q_s32(in + 4)
+ }
+ };
+ return r;
}
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmlaq_f32(x, y, z);
+ const qint32x4x2_t r =
+ {
+ {
+ vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
+ vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
+ }
+ };
+ return r;
}
-inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
{
- return vqmlal_qs8(x, y, z, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
+ vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
+ }
+ };
+ return r;
}
template <typename T1, typename T2, unsigned int stridex>
@@ -216,8 +281,7 @@ public:
window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
Iterator out(output, window_out);
Iterator in(input, window_in);
Iterator k(weights, window_k);
@@ -887,9 +951,9 @@ BorderSize NEDirectConvolutionLayerKernel::border_size() const
void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
"Pad > 0 not supported for 1x1 weights");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
@@ -919,6 +983,7 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
case DataType::F16:
#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
+ case DataType::QS16:
_num_elems_written_per_iteration = 8;
break;
case DataType::F32:
@@ -995,6 +1060,9 @@ void NEDirectConvolutionLayerKernel::run(const Window &window)
case DataType::QS8:
convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
+ case DataType::QS16:
+ convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
case DataType::F32:
convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
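The qint32x4x2_t plumbing above exists so that QS16 inputs can accumulate into 32-bit lanes. An approximate scalar model (illustrative only; the NEON path saturates at every multiply-accumulate step rather than once at the end) of one output element of a 1x1 QS16 convolution along the input channels:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Each input/weight product is a rounded Q-format long multiply (cf.
    // vmull_qs16) and the partial sums are accumulated in 32 bits, which is why
    // the kernel's intermediate output tensor must now be QS32.
    int32_t convolve_1x1_qs16(const std::vector<int16_t> &in, const std::vector<int16_t> &weights, int fixed_point_position)
    {
        const int64_t round = 1LL << (fixed_point_position - 1);
        int64_t       acc   = 0;
        for(size_t i = 0; i < in.size() && i < weights.size(); ++i)
        {
            acc += (static_cast<int64_t>(in[i]) * weights[i] + round) >> fixed_point_position;
        }
        // Clamp once at the end to the QS32 range.
        if(acc > INT32_MAX) { return INT32_MAX; }
        if(acc < INT32_MIN) { return INT32_MIN; }
        return static_cast<int32_t>(acc);
    }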
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index d5f03fcc41..0380e8cdb4 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -40,7 +40,7 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer()
void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Free accumulator
if(_accumulator.buffer() != nullptr)
@@ -49,17 +49,36 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
}
// Allocate the intermediate accumulator tensor in case of fixed point input
- if(output->info()->data_type() == DataType::QS8)
+ switch(output->info()->data_type())
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- }
- else
- {
- _conv_kernel.configure(input, weights, output, conv_info);
- _accumulate_bias_kernel.configure(output, bias);
+ case DataType::QS8:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::QS16:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::F16:
+ case DataType::F32:
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
// Add zero padding XY
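A minimal usage sketch of the new QS16 path (assumed caller code, not part of the patch; shapes and the fixed point position of 4 are arbitrary). The function internally allocates the QS32 accumulator configured above:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src{}, weights{}, bias{}, dst{};

        // QS16 tensors with 4 fractional bits; the weights are 1x1, so the QS16
        // branch of convolve_1x1 is taken and the bias is accumulated via QS32.
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::QS16, 4));
        weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 3U, 16U), 1, DataType::QS16, 4));
        bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QS16, 4));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::QS16, 4));

        NEDirectConvolutionLayer conv{};
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src, weights and bias with QS16 data ...

        conv.run();
        return 0;
    }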
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index effb898428..7022d656e9 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -48,11 +48,11 @@ using namespace arm_compute::test::validation;
namespace
{
-const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */
+const float tolerance_qs = 1.f; /**< Tolerance for fixed point (QS8/QS16) tests */
#ifdef ARM_COMPUTE_ENABLE_FP16
const float tolerance_fp16 = 0.01f; /**< Tolerance for half precision floating point tests */
#endif /* ARM_COMPUTE_ENABLE_FP16 */
-const float tolerance_qs8 = 1; /**< Tolerance for fixed point tests */
+const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */
/** Compute NEON direct convolution layer function.
*
@@ -91,18 +91,30 @@ Tensor compute_convolution_layer(const TensorShape &src_shape, const TensorShape
BOOST_TEST(!dst.info()->is_resizable());
// Fill tensors
- if(dt == DataType::F16 || dt == DataType::F32)
- {
- std::uniform_real_distribution<> distribution(-1.f, 1.f);
- library->fill(Accessor(src), distribution, 0);
- library->fill(Accessor(weights), distribution, 1);
- library->fill(Accessor(bias), distribution, 2);
- }
- else
+ switch(dt)
{
- library->fill_tensor_uniform(Accessor(src), 0);
- library->fill_tensor_uniform(Accessor(weights), 1);
- library->fill_tensor_uniform(Accessor(bias), 2);
+ case DataType::F16:
+ case DataType::F32:
+ {
+ std::uniform_real_distribution<> distribution(-1.f, 1.f);
+ library->fill(Accessor(src), distribution, 0);
+ library->fill(Accessor(weights), distribution, 1);
+ library->fill(Accessor(bias), distribution, 2);
+ break;
+ }
+ case DataType::QS8:
+ case DataType::QS16:
+ {
+ library->fill_tensor_uniform(Accessor(src), 0);
+ library->fill_tensor_uniform(Accessor(weights), 1);
+ library->fill_tensor_uniform(Accessor(bias), 2);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
}
// Compute function
@@ -221,8 +233,10 @@ BOOST_AUTO_TEST_SUITE_END()
BOOST_AUTO_TEST_SUITE(Quantized)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
BOOST_DATA_TEST_CASE(W1x1,
- DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }),
- input_shape, sx, sy, num_kernels, fixed_point_position)
+ DirectConvolutionShapes() * boost::unit_test::data::make({ DataType::QS8, DataType::QS16 }) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3,
+ 1)
+ * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }),
+ input_shape, dt, sx, sy, num_kernels, fixed_point_position)
{
const unsigned int kernel_size = 1;
const PadStrideInfo conv_info(sx, sy, 0, 0, DimensionRoundingType::FLOOR);
@@ -230,18 +244,20 @@ BOOST_DATA_TEST_CASE(W1x1,
const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
- Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position);
- RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position);
// Validate output
- validate(Accessor(dst), ref);
+ validate(Accessor(dst), ref, tolerance_qs);
}
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
-BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(0, 2, 1)
+BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3,
+ 1)
+ * boost::unit_test::data::xrange(0, 2, 1)
* boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }),
- input_shape, sx, sy, px, py, num_kernels, fixed_point_position)
+ input_shape, dt, sx, sy, px, py, num_kernels, fixed_point_position)
{
const unsigned int kernel_size = 3;
const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR);
@@ -249,12 +265,12 @@ BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::x
const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
- Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position);
- RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position);
// Validate output
- validate(Accessor(dst), ref, tolerance_qs8);
+ validate(Accessor(dst), ref, tolerance_qs);
}
BOOST_AUTO_TEST_SUITE_END()