author     Pablo Tello <pablo.tello@arm.com>          2017-07-26 10:28:40 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-09-17 14:16:42 +0100
commit     f87cc7f6fef95f9b022725304118796a6a764a7c (patch)
tree       06a643c47c93ba1a64dcca1ae787214a6fbfff54 /src
parent     6c928343b0fa2bf60ffdfe21aea28b598d742ed4 (diff)
COMPMID-417: Port NEDirectConvolution 1x1 to QS16.
Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp |  84
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp               | 128
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp                |  41
3 files changed, 192 insertions, 61 deletions
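
The change follows the existing QS8 pattern: a Q-format 16-bit multiply needs up to roughly 30 significant bits before rescaling (for example, with fixed_point_position = 14 two raw operands near 2^15 multiply to about 2^30), so the 1x1 convolution accumulates in QS32 and only narrows back to QS16 once the bias has been added. Below is a rough sketch of the arithmetic the new internal_vmull/internal_vmlal overloads perform, written with plain NEON intrinsics instead of the library's vmull_qs16/vqmlal_qs16 wrappers; the helper names are hypothetical and the rescale-by-fixed_point_position step is assumed to match the wrappers' behaviour.

#include <arm_neon.h>

// Widening multiply-accumulate for QS16 data: acc (QS32) += x * y (both QS16).
// The product of two Qn values is Q2n, so it is brought back to Qn with a
// saturating rounding right shift by fixed_point_position before accumulation.
inline int32x4_t qs16_mac(int32x4_t acc, int16x4_t x, int16x4_t y, int fixed_point_position)
{
    int32x4_t prod = vmull_s16(x, y);                                       // 16 x 16 -> 32 bit
    prod           = vqrshlq_s32(prod, vdupq_n_s32(-fixed_point_position)); // rescale to Qn
    return vqaddq_s32(acc, prod);                                           // saturating add
}

// Eight QS16 lanes are processed as two int32x4_t halves, mirroring the
// qint32x4x2_t returned by the new internal_vmull/internal_vmlal overloads.
inline int32x4x2_t qs16_mac8(int32x4x2_t acc, int16x8_t x, int16x8_t y, int fixed_point_position)
{
    acc.val[0] = qs16_mac(acc.val[0], vget_low_s16(x),  vget_low_s16(y),  fixed_point_position);
    acc.val[1] = qs16_mac(acc.val[1], vget_high_s16(x), vget_high_s16(y), fixed_point_position);
    return acc;
}
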
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index fb16c8dcc1..12ef064803 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -54,6 +54,11 @@ inline qint16x8_t internal_vld1q(const qint16_t *in)
return vld1q_qs16(in);
}
+inline qint32x4_t internal_vld1q(const qint32_t *in)
+{
+ return vld1q_s32(in);
+}
+
// Internal store
inline void internal_vst1q(float *p, const float32x4_t &v)
{
@@ -72,6 +77,16 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
vst1q_qs16(p, v);
}
+inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
+{
+ vst1q_s32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
+{
+ vst1_qs16(p, vqmovn_qs32(v));
+}
+
// Internal vdup
inline float32x4_t internal_vdupq_n(float v)
{
@@ -86,6 +101,11 @@ inline qint16x8_t internal_vdupq_n(qint16_t v)
return vdupq_n_qs16(v);
}
+inline qint32x4_t internal_vdupq_n(qint32_t v)
+{
+ return vdupq_n_qs32(v);
+}
+
// Internal vadd
inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
{
@@ -99,6 +119,10 @@ inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
{
return vqaddq_qs16(x, y);
}
+inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
+{
+ return vqaddq_qs32(x, y);
+}
#ifdef ARM_COMPUTE_ENABLE_FP16
inline float16x8_t internal_vld1q(const float16_t *in)
@@ -162,8 +186,8 @@ NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumu
void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
if(output != nullptr)
{
@@ -198,27 +222,47 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con
INEKernel::configure(win);
// Set appropriate function
- if(input->info()->data_type() == DataType::F32)
+ switch(input->info()->data_type())
{
- _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
- }
+ case DataType::QS8:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ break;
+ }
+ case DataType::QS16:
+ {
+ if(bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ case DataType::QS32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
+ break;
+ }
#ifdef ARM_COMPUTE_ENABLE_FP16
- else if(input->info()->data_type() == DataType::F16)
- {
- _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
- }
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
+ break;
+ }
#endif /* ARM_COMPUTE_ENABLE_FP16 */
- else if(input->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
- }
- else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ break;
+ }
}
}
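
The bias-accumulate kernel gains the matching QS32 input path: the 32-bit accumulator produced by the convolution kernel is loaded, the bias is added with saturation, and the result is narrowed back to QS16 on store (the new internal_vst1q(qint16_t *, const qint32x4_t &) overload built on vqmovn_qs32). A minimal sketch of that step with plain NEON intrinsics, assuming the bias can simply be sign-extended to 32 bit because input and bias share the same fixed point position (checked above); the helper name is hypothetical.

// Add a bias value to four QS32 accumulator lanes and write the result out as QS16.
inline void accumulate_bias_qs32_to_qs16(int16_t *out, const int32_t *acc, int16_t bias)
{
    int32x4_t v = vld1q_s32(acc);                                         // load QS32 accumulator
    v           = vqaddq_s32(v, vdupq_n_s32(static_cast<int32_t>(bias))); // saturating bias add
    vst1_s16(out, vqmovn_s32(v));                                         // saturating narrow to QS16
}
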
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 09d8dd5642..43292d1b22 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -39,6 +39,34 @@ using namespace arm_compute;
namespace
{
+template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+template <>
+qint16x8_t internal_vld1q<2>(const qint16_t *in)
+{
+ const int16x8x2_t tmp = vld2q_s16(in);
+ return tmp.val[0];
+}
+
+template <>
+qint16x8_t internal_vld1q<3>(const qint16_t *in)
+{
+ const int16x8x3_t tmp = vld3q_s16(in);
+ return tmp.val[0];
+}
+
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
#ifdef ARM_COMPUTE_ENABLE_FP16
template <unsigned int stridex>
float16x8_t internal_vld1q(const float16_t *in);
@@ -109,6 +137,28 @@ float32x4_t internal_vld1q<3>(const float *in)
return tmp.val[0];
}
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
template <unsigned int stridex>
qint8x8_t internal_vld1q(const qint8_t *in);
@@ -132,28 +182,19 @@ qint8x8_t internal_vld1q<3>(const qint8_t *in)
return tmp.val[0];
}
-template <unsigned int stridex>
-qint16x8_t internal_vld1q(const qint16_t *in);
-
-template <>
-qint16x8_t internal_vld1q<1>(const qint16_t *in)
-{
- return vld1q_s16(in);
-}
-
-inline float32x4_t internal_vdupq_n(float v)
+inline qint8x8_t internal_vdupq_n(qint8_t v)
{
- return vdupq_n_f32(v);
+ return vdup_n_qs8(v);
}
-inline qint8x8_t internal_vdupq_n(qint8_t v)
+inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
{
- return vdup_n_qs8(v);
+ return vmull_qs8(x, y, fixed_point_position);
}
-inline void internal_vst1q(float *p, const float32x4_t &v)
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
{
- vst1q_f32(p, v);
+ return vqmlal_qs8(x, y, z, fixed_point_position);
}
inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
@@ -161,26 +202,50 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
vst1q_qs16(p, v);
}
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+inline void internal_vst1q(int *p, const qint32x4x2_t &v)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmulq_f32(x, y);
+ vst1q_s32(p, v.val[0]);
+ vst1q_s32(p + 4, v.val[1]);
}
-qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+template <unsigned int stridex>
+qint32x4x2_t internal_vld1q(const qint32_t *in);
+
+template <>
+qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
{
- return vmull_qs8(x, y, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vld1q_s32(in),
+ vld1q_s32(in + 4)
+ }
+ };
+ return r;
}
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmlaq_f32(x, y, z);
+ const qint32x4x2_t r =
+ {
+ {
+ vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
+ vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
+ }
+ };
+ return r;
}
-inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
{
- return vqmlal_qs8(x, y, z, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
+ vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
+ }
+ };
+ return r;
}
template <typename T1, typename T2, unsigned int stridex>
@@ -216,8 +281,7 @@ public:
window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
Iterator out(output, window_out);
Iterator in(input, window_in);
Iterator k(weights, window_k);
@@ -887,9 +951,9 @@ BorderSize NEDirectConvolutionLayerKernel::border_size() const
void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
"Pad > 0 not supported for 1x1 weights");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
@@ -919,6 +983,7 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
case DataType::F16:
#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
+ case DataType::QS16:
_num_elems_written_per_iteration = 8;
break;
case DataType::F32:
@@ -995,6 +1060,9 @@ void NEDirectConvolutionLayerKernel::run(const Window &window)
case DataType::QS8:
convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
+ case DataType::QS16:
+ convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
case DataType::F32:
convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
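
For strides larger than one, the QS16 loads reuse the same trick as the existing QS8 and F32 paths: the de-interleaving vld2q_s16/vld3q_s16 loads fetch 16 or 24 values and only val[0] (every 2nd or 3rd element) is kept, which is what the new internal_vld1q<2>/<3> specialisations above do. A small illustration for stride 2, with a hypothetical helper name:

// Load eight QS16 inputs with a stride of 2: vld2q_s16 de-interleaves 16 values
// and val[0] holds elements 0, 2, 4, ..., i.e. the samples the convolution needs.
inline int16x8_t load_qs16_stride2(const int16_t *in)
{
    const int16x8x2_t tmp = vld2q_s16(in);
    return tmp.val[0];
}
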
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index d5f03fcc41..0380e8cdb4 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -40,7 +40,7 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer()
void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Free accumulator
if(_accumulator.buffer() != nullptr)
@@ -49,17 +49,36 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
}
// Allocate the intermediate accumulator tensor in case of fixed point input
- if(output->info()->data_type() == DataType::QS8)
+ switch(output->info()->data_type())
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- }
- else
- {
- _conv_kernel.configure(input, weights, output, conv_info);
- _accumulate_bias_kernel.configure(output, bias);
+ case DataType::QS8:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::QS16:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::F16:
+ case DataType::F32:
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
// Add zero padding XY