From f87cc7f6fef95f9b022725304118796a6a764a7c Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 26 Jul 2017 10:28:40 +0100 Subject: COMPMID-417: Port NEDirectConvolution 1x1 to QS16. Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662 Reviewed-by: Moritz Pflanzer Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/NEON/NEFixedPoint.h | 11 ++ arm_compute/core/NEON/NEFixedPoint.inl | 14 +++ .../NEDirectConvolutionLayerBiasAccumulateKernel.h | 2 +- .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 2 +- arm_compute/core/Types.h | 1 + arm_compute/core/Utils.h | 3 + .../NEON/functions/NEDirectConvolutionLayer.h | 2 +- ...EDirectConvolutionLayerBiasAccumulateKernel.cpp | 84 ++++++++++---- .../kernels/NEDirectConvolutionLayerKernel.cpp | 128 ++++++++++++++++----- .../NEON/functions/NEDirectConvolutionLayer.cpp | 41 +++++-- tests/validation/NEON/DirectConvolutionLayer.cpp | 62 ++++++---- 11 files changed, 263 insertions(+), 87 deletions(-) diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 08f680801d..3de226112e 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -48,6 +48,7 @@ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 element using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */ using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */ using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */ +using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */ /** Get the lower half of a 16 elements vector * @@ -673,6 +674,16 @@ qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); */ qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); +/** 16 bit fixed point vector long multiply (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 32 bit fixed point long vector multiplication. + */ +qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); + /** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). 
* * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index c879d3e275..dd1066d6bc 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -624,6 +624,20 @@ inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) return vqrshlq_s16(res, fixed_point_position_s16); } +inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position) +{ + const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmull_s16(a, b); + + // Shift right by fixed_point_position + return vqshlq_s32(tmp, fixed_point_position_s32); +} + inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) { const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h index f098e18655..87788ba389 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h @@ -51,7 +51,7 @@ public: /** Set the accumulate buffer and the biases of the kernel. * * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS8/F32 + * Data type supported: QS8/QS16/F16/F32 * @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) * Data type supported: Same as @p input diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index 5612e1ae62..e0dac9858b 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -49,7 +49,7 @@ public: /** Set the input, weights, and output tensors. * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. 
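
The vmull_qs16 added above widens two QS16 vectors into a QS32 result and rescales the products by fixed_point_position, mirroring the existing vmull_qs8. Below is a minimal scalar sketch of the same arithmetic, assuming a plain Q-format layout; it is an illustration, not the library implementation, and the rounding and saturation applied by the NEON shift are left out.

#include <cstdint>
#include <iostream>

// Scalar model of one lane of vmull_qs16: widen the operands, multiply, then
// shift the product right by fixed_point_position to restore the QS scaling.
// Rounding and saturation performed by the NEON shift are omitted here.
int32_t scalar_mull_qs16(int16_t a, int16_t b, int fixed_point_position)
{
    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    return static_cast<int32_t>(product >> fixed_point_position);
}

int main()
{
    const int     fpp = 14;              // 14 fractional bits: raw 0x4000 represents 1.0
    const int16_t x   = 3 << (fpp - 1);  // 1.5 in that format
    const int32_t y   = scalar_mull_qs16(x, x, fpp);
    std::cout << static_cast<double>(y) / (1 << fpp) << std::endl; // prints 2.25
    return 0;
}

With fixed_point_position = 14 the sketch multiplies 1.5 by 1.5 and recovers 2.25 from the widened QS32 result.
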
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 765cae4ad4..2d3b3d6f66 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -68,6 +68,7 @@ enum class DataType QS16, U32, S32, + QS32, U64, S64, F16, diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 4ecd464cdb..af788beeb7 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -100,6 +100,7 @@ inline size_t data_size_from_type(DataType data_type) case DataType::F32: case DataType::U32: case DataType::S32: + case DataType::QS32: return 4; case DataType::F64: case DataType::U64: @@ -173,6 +174,7 @@ inline size_t element_size_from_data_type(DataType dt) case DataType::U32: case DataType::S32: case DataType::F32: + case DataType::QS32: return 4; default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); @@ -645,6 +647,7 @@ inline bool is_data_type_fixed_point(DataType dt) { case DataType::QS8: case DataType::QS16: + case DataType::QS32: return true; default: return false; diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index a66cab3013..872fae3a6b 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -48,7 +48,7 @@ public: NEDirectConvolutionLayer(); /** Set the input, weights, biases and output tensors. * - * @param[in, out] input Input tensor. Data types supported: QS8/F16/F32. + * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32. * @param[in] weights Set of kernels to convolve the input volume. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported: Same as @p input. 
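
The new QS32 entry in DataType is the 32 bit signed fixed point format used further down as the accumulation type for the QS16 convolution; as with QS8 and QS16, a raw value encodes raw / 2^fixed_point_position, and the Utils.h changes register its 4 byte element size and mark it as a fixed point type. A small sketch of that interpretation follows, using hypothetical helper names that are not part of the library.

#include <cstdint>
#include <iostream>

// Hypothetical helpers (not library functions) showing how a raw QS32 value
// relates to the real number it encodes for a given fixed_point_position.
float qs32_to_float(int32_t raw, int fixed_point_position)
{
    return static_cast<float>(raw) / static_cast<float>(1 << fixed_point_position);
}

int32_t float_to_qs32(float value, int fixed_point_position)
{
    return static_cast<int32_t>(value * static_cast<float>(1 << fixed_point_position));
}

int main()
{
    const int     fpp = 5; // the tests below exercise fixed point positions 4 and 5
    const int32_t raw = float_to_qs32(3.25f, fpp);
    std::cout << raw << " encodes " << qs32_to_float(raw, fpp) << std::endl; // 104 encodes 3.25
    return 0;
}
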
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp index fb16c8dcc1..12ef064803 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp @@ -54,6 +54,11 @@ inline qint16x8_t internal_vld1q(const qint16_t *in) return vld1q_qs16(in); } +inline qint32x4_t internal_vld1q(const qint32_t *in) +{ + return vld1q_s32(in); +} + // Internal store inline void internal_vst1q(float *p, const float32x4_t &v) { @@ -72,6 +77,16 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) vst1q_qs16(p, v); } +inline void internal_vst1q(qint32_t *p, const qint32x4_t &v) +{ + vst1q_s32(p, v); +} + +inline void internal_vst1q(qint16_t *p, const qint32x4_t &v) +{ + vst1_qs16(p, vqmovn_qs32(v)); +} + // Internal vdup inline float32x4_t internal_vdupq_n(float v) { @@ -86,6 +101,11 @@ inline qint16x8_t internal_vdupq_n(qint16_t v) return vdupq_n_qs16(v); } +inline qint32x4_t internal_vdupq_n(qint32_t v) +{ + return vdupq_n_qs32(v); +} + // Internal vadd inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y) { @@ -99,6 +119,10 @@ inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y) { return vqaddq_qs16(x, y); } +inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y) +{ + return vqaddq_qs32(x, y); +} #ifdef ARM_COMPUTE_ENABLE_FP16 inline float16x8_t internal_vld1q(const float16_t *in) @@ -162,8 +186,8 @@ NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumu void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position()); if(output != nullptr) { @@ -198,27 +222,47 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con INEKernel::configure(win); // Set appropriate function - if(input->info()->data_type() == DataType::F32) + switch(input->info()->data_type()) { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } + case DataType::QS8: + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + break; + } + case DataType::QS16: + { + if(bias->info()->data_type() == DataType::QS8) + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } + break; + } + case DataType::QS32: + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + break; + } #ifdef ARM_COMPUTE_ENABLE_FP16 - else if(input->info()->data_type() == DataType::F16) - { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } + case DataType::F16: + { + _func = (output == nullptr) ? 
&accumulate_bias : &accumulate_bias; + break; + } #endif /* ARM_COMPUTE_ENABLE_FP16 */ - else if(input->info()->data_type() == DataType::QS8) - { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } - else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8) - { - _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; - } - else - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + case DataType::F32: + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + break; + } } } diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 09d8dd5642..43292d1b22 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -39,6 +39,34 @@ using namespace arm_compute; namespace { +template +qint16x8_t internal_vld1q(const qint16_t *in); + +template <> +qint16x8_t internal_vld1q<1>(const qint16_t *in) +{ + return vld1q_qs16(in); +} + +template <> +qint16x8_t internal_vld1q<2>(const qint16_t *in) +{ + const int16x8x2_t tmp = vld2q_s16(in); + return tmp.val[0]; +} + +template <> +qint16x8_t internal_vld1q<3>(const qint16_t *in) +{ + const int16x8x3_t tmp = vld3q_s16(in); + return tmp.val[0]; +} + +inline qint16x8_t internal_vdupq_n(qint16_t v) +{ + return vdupq_n_qs16(v); +} + #ifdef ARM_COMPUTE_ENABLE_FP16 template float16x8_t internal_vld1q(const float16_t *in); @@ -109,6 +137,28 @@ float32x4_t internal_vld1q<3>(const float *in) return tmp.val[0]; } +inline float32x4_t internal_vdupq_n(float v) +{ + return vdupq_n_f32(v); +} + +inline void internal_vst1q(float *p, const float32x4_t &v) +{ + vst1q_f32(p, v); +} + +float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + return vmulq_f32(x, y); +} + +inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + return vmlaq_f32(x, y, z); +} + template qint8x8_t internal_vld1q(const qint8_t *in); @@ -132,28 +182,19 @@ qint8x8_t internal_vld1q<3>(const qint8_t *in) return tmp.val[0]; } -template -qint16x8_t internal_vld1q(const qint16_t *in); - -template <> -qint16x8_t internal_vld1q<1>(const qint16_t *in) -{ - return vld1q_s16(in); -} - -inline float32x4_t internal_vdupq_n(float v) +inline qint8x8_t internal_vdupq_n(qint8_t v) { - return vdupq_n_f32(v); + return vdup_n_qs8(v); } -inline qint8x8_t internal_vdupq_n(qint8_t v) +inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position) { - return vdup_n_qs8(v); + return vmull_qs8(x, y, fixed_point_position); } -inline void internal_vst1q(float *p, const float32x4_t &v) +inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position) { - vst1q_f32(p, v); + return vqmlal_qs8(x, y, z, fixed_point_position); } inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) @@ -161,26 +202,50 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) vst1q_qs16(p, v); } -float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position) +inline void internal_vst1q(int *p, const qint32x4x2_t &v) { - 
ARM_COMPUTE_UNUSED(fixed_point_position); - return vmulq_f32(x, y); + vst1q_s32(p, v.val[0]); + vst1q_s32(p + 4, v.val[1]); } -qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position) +template +qint32x4x2_t internal_vld1q(const qint32_t *in); + +template <> +qint32x4x2_t internal_vld1q<1>(const qint32_t *in) { - return vmull_qs8(x, y, fixed_point_position); + const qint32x4x2_t r = + { + { + vld1q_s32(in), + vld1q_s32(in + 4) + } + }; + return r; } -inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position) +inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position) { - ARM_COMPUTE_UNUSED(fixed_point_position); - return vmlaq_f32(x, y, z); + const qint32x4x2_t r = + { + { + vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position), + vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position), + } + }; + return r; } -inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position) +inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position) { - return vqmlal_qs8(x, y, z, fixed_point_position); + const qint32x4x2_t r = + { + { + vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position), + vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position) + } + }; + return r; } template @@ -216,8 +281,7 @@ public: window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); Iterator out(output, window_out); Iterator in(input, window_in); Iterator k(weights, window_k); @@ -887,9 +951,9 @@ BorderSize NEDirectConvolutionLayerKernel::border_size() const void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), "Pad > 0 not supported for 1x1 weights"); ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), @@ -919,6 +983,7 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens case DataType::F16: #endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::QS8: + case DataType::QS16: _num_elems_written_per_iteration = 8; break; case DataType::F32: @@ -995,6 +1060,9 @@ void NEDirectConvolutionLayerKernel::run(const Window &window) case DataType::QS8: convolve_1x1(window, _num_elems_read_per_iteration, 
_num_elems_written_per_iteration, _input, _weights, _output, _conv_info); break; + case DataType::QS16: + convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + break; case DataType::F32: convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); break; diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index d5f03fcc41..0380e8cdb4 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -40,7 +40,7 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer() void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); // Free accumulator if(_accumulator.buffer() != nullptr) @@ -49,17 +49,36 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, } // Allocate the intermediate accumulator tensor in case of fixed point input - if(output->info()->data_type() == DataType::QS8) + switch(output->info()->data_type()) { - _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position())); - _conv_kernel.configure(input, weights, &_accumulator, conv_info); - _accumulate_bias_kernel.configure(&_accumulator, bias, output); - _accumulator.allocator()->allocate(); - } - else - { - _conv_kernel.configure(input, weights, output, conv_info); - _accumulate_bias_kernel.configure(output, bias); + case DataType::QS8: + { + _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position())); + _conv_kernel.configure(input, weights, &_accumulator, conv_info); + _accumulate_bias_kernel.configure(&_accumulator, bias, output); + _accumulator.allocator()->allocate(); + break; + } + case DataType::QS16: + { + _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position())); + _conv_kernel.configure(input, weights, &_accumulator, conv_info); + _accumulate_bias_kernel.configure(&_accumulator, bias, output); + _accumulator.allocator()->allocate(); + break; + } + case DataType::F16: + case DataType::F32: + { + _conv_kernel.configure(input, weights, output, conv_info); + _accumulate_bias_kernel.configure(output, bias); + break; + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } } // Add zero padding XY diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index effb898428..7022d656e9 100644 --- a/tests/validation/NEON/DirectConvolutionLayer.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -48,11 +48,11 @@ using namespace arm_compute::test::validation; namespace { -const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */ +const float tolerance_qs = 1.f; /**< Tolerance for 8 bit fixed point tests */ #ifdef ARM_COMPUTE_ENABLE_FP16 const float tolerance_fp16 = 0.01f; /**< Tolerance for half precision floating point tests */ #endif /* ARM_COMPUTE_ENABLE_FP16 */ -const float 
tolerance_qs8 = 1; /**< Tolerance for fixed point tests */ +const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */ /** Compute NEON direct convolution layer function. * @@ -91,18 +91,30 @@ Tensor compute_convolution_layer(const TensorShape &src_shape, const TensorShape BOOST_TEST(!dst.info()->is_resizable()); // Fill tensors - if(dt == DataType::F16 || dt == DataType::F32) - { - std::uniform_real_distribution<> distribution(-1.f, 1.f); - library->fill(Accessor(src), distribution, 0); - library->fill(Accessor(weights), distribution, 1); - library->fill(Accessor(bias), distribution, 2); - } - else + switch(dt) { - library->fill_tensor_uniform(Accessor(src), 0); - library->fill_tensor_uniform(Accessor(weights), 1); - library->fill_tensor_uniform(Accessor(bias), 2); + case DataType::F16: + case DataType::F32: + { + std::uniform_real_distribution<> distribution(-1.f, 1.f); + library->fill(Accessor(src), distribution, 0); + library->fill(Accessor(weights), distribution, 1); + library->fill(Accessor(bias), distribution, 2); + break; + } + case DataType::QS8: + case DataType::QS16: + { + library->fill_tensor_uniform(Accessor(src), 0); + library->fill_tensor_uniform(Accessor(weights), 1); + library->fill_tensor_uniform(Accessor(bias), 2); + break; + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } } // Compute function @@ -221,8 +233,10 @@ BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE(Quantized) BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) BOOST_DATA_TEST_CASE(W1x1, - DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }), - input_shape, sx, sy, num_kernels, fixed_point_position) + DirectConvolutionShapes() * boost::unit_test::data::make({ DataType::QS8, DataType::QS16 }) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, + 1) + * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }), + input_shape, dt, sx, sy, num_kernels, fixed_point_position) { const unsigned int kernel_size = 1; const PadStrideInfo conv_info(sx, sy, 0, 0, DimensionRoundingType::FLOOR); @@ -230,18 +244,20 @@ BOOST_DATA_TEST_CASE(W1x1, const TensorShape b_shape(static_cast(num_kernels)); const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info)); - Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); - RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); // Validate output - validate(Accessor(dst), ref); + validate(Accessor(dst), ref, tolerance_qs); } BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) -BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(0, 2, 1) +BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, + 1) + * boost::unit_test::data::xrange(0, 2, 
1) * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }), - input_shape, sx, sy, px, py, num_kernels, fixed_point_position) + input_shape, dt, sx, sy, px, py, num_kernels, fixed_point_position) { const unsigned int kernel_size = 3; const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR); @@ -249,12 +265,12 @@ BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::x const TensorShape b_shape(static_cast(num_kernels)); const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info)); - Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); - RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position); + RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, fixed_point_position); // Validate output - validate(Accessor(dst), ref, tolerance_qs8); + validate(Accessor(dst), ref, tolerance_qs); } BOOST_AUTO_TEST_SUITE_END() -- cgit v1.2.1
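
Taken together, the QS16 1x1 path added by this patch works in three stages: NEDirectConvolutionLayerKernel widens each QS16 input/weight product into QS32 accumulators (the internal_vmull and internal_vmlal overloads over qint32x4x2_t), NEDirectConvolutionLayer allocates a QS32 intermediate tensor to hold them, and NEDirectConvolutionLayerBiasAccumulateKernel adds the bias and narrows the result back to QS16 with saturation (internal_vst1q storing through vqmovn_qs32). The following scalar sketch models one output element under those assumptions; the helper name is made up, and saturation is approximated with std::clamp rather than the saturating NEON intrinsics used by the kernels.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Scalar model of one output element of the QS16 1x1 path: every input/weight
// product is widened and rescaled into a 32 bit (QS32) accumulator, the bias
// (same fixed point position) is added, and the result is narrowed to QS16.
int16_t convolve_1x1_qs16_scalar(const std::vector<int16_t> &input,   // one x/y position across IFM
                                 const std::vector<int16_t> &weights, // 1x1 kernel across IFM
                                 int16_t bias, int fixed_point_position)
{
    int64_t acc = 0; // conceptually the QS32 accumulator tensor
    for(size_t ifm = 0; ifm < input.size(); ++ifm)
    {
        const int64_t product = static_cast<int64_t>(input[ifm]) * weights[ifm];
        acc += product >> fixed_point_position; // as vmull_qs16 / vqmlal_qs16 rescale each product
    }
    acc += bias;                                          // bias accumulate stage
    acc = std::clamp<int64_t>(acc, INT16_MIN, INT16_MAX); // narrow with saturation (vqmovn_qs32)
    return static_cast<int16_t>(acc);
}

int main()
{
    const int fpp = 4;
    const std::vector<int16_t> in      = { 16, 32, 48 };  // 1.0, 2.0, 3.0 in QS16 with 4 fractional bits
    const std::vector<int16_t> weights = { 16, 16, 16 };  // 1.0 each
    const int16_t bias                 = 8;               // 0.5
    const int16_t out = convolve_1x1_qs16_scalar(in, weights, bias, fpp);
    std::cout << static_cast<float>(out) / (1 << fpp) << std::endl; // prints 6.5
    return 0;
}

The validation tests above exercise exactly this path for 1x1 kernels with QS8 and QS16 inputs, comparing against the reference implementation with a tolerance of one fixed point step.
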