author     Michele Di Giorgio <michele.digiorgio@arm.com>    2017-07-26 17:09:17 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>         2018-09-17 14:16:42 +0100
commit     d5e65c71261fd42d3e69478507fbfcc8cf36befc (patch)
tree       4892d179782b61f4198b45741d84b7d7fb30a011
parent     baa656d41a9ef9027fca866c890a07b15747feda (diff)
COMPMID-456: Add support for QS16 NEON Normalization Layer.
Change-Id: I1e542808cfd7774c67cc4e9a58e42449e4fb29aa
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81735
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.h                          25
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.inl                        17
-rw-r--r--  arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h     4
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.cpp         131
-rw-r--r--  tests/benchmark_new/NEON/NormalizationLayer.cpp                 4
-rw-r--r--  tests/validation_new/CPP/NormalizationLayer.cpp                 1
-rw-r--r--  tests/validation_new/NEON/NormalizationLayer.cpp               21

7 files changed, 163 insertions(+), 40 deletions(-)
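
For context, the QS8/QS16 types used throughout this patch are the library's signed fixed-point formats: a raw integer r at a given fixed_point_position p represents the value r / 2^p. Below is a minimal scalar sketch of that encoding, assuming nothing beyond standard C++; the helper names to_qs16/to_float are illustrative, not library API, and only roughly mirror what the saturating vector conversion helpers do lane by lane.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative only: quantise a float to a QS16 value with saturation,
    // and convert a QS16 value back to float.
    int16_t to_qs16(float x, int fixed_point_position)
    {
        const float scaled = std::round(x * (1 << fixed_point_position));
        return static_cast<int16_t>(std::min(std::max(scaled, -32768.0f), 32767.0f));
    }

    float to_float(int16_t q, int fixed_point_position)
    {
        return static_cast<float>(q) / (1 << fixed_point_position);
    }

For example, with fixed_point_position = 12, to_qs16(1.5f, 12) gives 6144 and to_float(6144, 12) recovers 1.5f.
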
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 50463b5efe..08f680801d 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -235,13 +235,22 @@ qint8x16_t vdupq_n_qs8(qint8_t a);
/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
*
- * @param[in] a 8 bit fixed point to duplicate
+ * @param[in] a floating point value to convert and duplicate
* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
*
* @return The result of the vector duplication
*/
qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
+/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements)
+ *
+ * @param[in] a floating point value to convert and duplicate
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position);
+
/** 16 bit fixed point vector duplicate (8 elements)
*
* @param[in] a 16 bit fixed point to duplicate
@@ -1178,7 +1187,19 @@ qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position);
*
* @return The result of the 8bit power.
*/
-qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position);
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Calculate saturating n power for fixed point 16bit (8 elements).
+ *
+ * pow(a,b) = e^(b*log(a))
+ *
+ * @param[in] a 16bit fixed point input vector
+ * @param[in] b 16bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16bit power.
+ */
+qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
/** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
*
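
The doc comment above states the identity the power helpers rely on, pow(a,b) = e^(b*log(a)). A scalar floating-point sketch of that identity follows; it is plain C++, not the library's fixed-point code, but it is a useful mental model for the exp/mul/log chain in the implementation below.

    #include <cassert>
    #include <cmath>

    // pow(a, b) = exp(b * log(a)) holds for a > 0; the saturating fixed-point
    // versions follow the same structure using quantised exp/log approximations.
    float pow_via_exp_log(float a, float b)
    {
        assert(a > 0.0f);
        return std::exp(b * std::log(a));
    }

For instance, pow_via_exp_log(2.0f, 3.0f) evaluates to approximately 8.
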
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 7cebfad924..c879d3e275 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -250,6 +250,18 @@ inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
return vqcvtq_qs8_f32(res, fixed_point_position);
}
+inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
+{
+ float32x4x2_t res =
+ {
+ {
+ vdupq_n_f32(a),
+ vdupq_n_f32(a),
+ }
+ };
+ return vqcvtq_qs16_f32(res, fixed_point_position);
+}
+
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
return vdupq_n_s16(a);
@@ -1941,6 +1953,11 @@ inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_positio
return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
}
+inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
+{
+ return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
+}
+
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
float32x4x2_t res =
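
A hypothetical usage sketch of the two new QS16 helpers, assuming a NEON-enabled build with the header above available; the wrapper function broadcast_pow_qs16 is illustrative and not part of the patch.

    #include "arm_compute/core/NEON/NEFixedPoint.h"

    using namespace arm_compute;

    // Broadcast two floats to QS16 vectors and raise one to the power of the
    // other, all at the same fixed_point_position.
    qint16x8_t broadcast_pow_qs16(float base, float exponent, int fixed_point_position)
    {
        const qint16x8_t a = vdupq_n_qs16_f32(base, fixed_point_position);
        const qint16x8_t b = vdupq_n_qs16_f32(exponent, fixed_point_position);
        return vqpowq_qs16(a, b, fixed_point_position);
    }
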
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index b1bc594e4c..e24e481f46 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -50,7 +50,7 @@ public:
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32.
+ * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32.
* @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
* Data type supported: same as @p input
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -86,7 +86,7 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- template <unsigned int dim, bool do_2D_norm>
+ template <DataType dt, unsigned int dim, bool do_2D_norm>
void normalize_fixed_point(const Window &window);
/** Common signature for all the specialised normalization functions
*
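
The header change promotes the data type to a non-type template parameter, so the QS8 and QS16 paths are selected once at configure time rather than branching per element. One common way to express such a mapping is a small traits struct keyed on DataType; the sketch below is illustrative only (the include paths and typedef availability are assumptions, and this is not the kernel's actual code).

    #include "arm_compute/core/NEON/NEFixedPoint.h" // assumed include path
    #include "arm_compute/core/Types.h"             // assumed home of DataType

    using namespace arm_compute;

    template <DataType dt>
    struct fixed_point_vec; // primary template intentionally undefined

    template <>
    struct fixed_point_vec<DataType::QS8>
    {
        using type = qint8x16_t; // 16 lanes processed per iteration
    };

    template <>
    struct fixed_point_vec<DataType::QS16>
    {
        using type = qint16x8_t; // 8 lanes processed per iteration
    };
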
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 76ace91c20..085d412558 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -46,7 +46,7 @@ BorderSize NENormalizationLayerKernel::border_size() const
void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
@@ -118,14 +118,35 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
switch(norm_info.type())
{
case NormType::IN_MAP_1D:
- _func = &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, false>;
break;
case NormType::IN_MAP_2D:
// Normalize over X and Y
- _func = &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, true>;
break;
case NormType::CROSS_MAP:
- _func = &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration = 8;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
break;
default:
ARM_COMPUTE_ERROR("Not supported");
@@ -250,7 +271,7 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
}
}
-template <unsigned int dim, bool do_2D_norm>
+template <DataType dt, unsigned int dim, bool do_2D_norm>
void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
{
Iterator input(_input, window);
@@ -269,40 +290,84 @@ void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
const int fixed_point_position = _input->info()->fixed_point_position();
- const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
- const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
- const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
+ if(dt == DataType::QS8)
+ {
+ const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+ const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint8x16_t accu = vdupq_n_qs8(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+ const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+ else if(dt == DataType::QS16)
{
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- // Accumulate 2D In-Map values
- qint8x16_t accu = vdupq_n_qs8(0);
- for(int j = first_row; j <= last_row; ++j)
+ const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint16x8_t beta_vec = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position);
+ const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position);
+
+ execute_window_loop(window, [&](const Coordinates & id)
{
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint16x8_t accu = vdupq_n_qs16(0);
+ for(int j = first_row; j <= last_row; ++j)
{
- accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast<const qint16_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
}
- }
- // Normalize
- const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
- const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
- const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
+ // Normalize
+ const qint16x8_t accu_scale = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint16x8_t normalized = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position);
+ const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
void NENormalizationLayerKernel::run(const Window &window)
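
The QS8 and QS16 loop bodies above compute the same per-element expression; only the vector width and intrinsics differ. A scalar reference of that expression is sketched below, mirroring the kappa/coeff/beta vectors and the vqmlaq/vqpowq/vdivq calls; it is illustrative, not a drop-in replacement for the kernel.

    #include <cmath>

    // out = in / (kappa + coeff * sum_of_squares)^beta, the normalization the
    // fixed-point branches evaluate with saturating NEON intrinsics.
    float normalize_element(float in, float sum_of_squares,
                            float kappa, float coeff, float beta)
    {
        const float scaled = kappa + coeff * sum_of_squares; // vqmlaq_*
        return in / std::pow(scaled, beta);                  // vqpowq_* then vdivq_*
    }
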
diff --git a/tests/benchmark_new/NEON/NormalizationLayer.cpp b/tests/benchmark_new/NEON/NormalizationLayer.cpp
index 71dd9c354c..de7183d2ec 100644
--- a/tests/benchmark_new/NEON/NormalizationLayer.cpp
+++ b/tests/benchmark_new/NEON/NormalizationLayer.cpp
@@ -41,9 +41,9 @@ namespace test
namespace
{
#ifdef ARM_COMPUTE_ENABLE_FP16
-const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32, DataType::QS8 });
+const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F16, DataType::F32 });
#else /* ARM_COMPUTE_ENABLE_FP16 */
-const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::F32, DataType::QS8 });
+const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F32 });
#endif /* ARM_COMPUTE_ENABLE_FP16 */
} // namespace
using NENormalizationLayerFixture = NormalizationLayerFixture<Tensor, NENormalizationLayer, Accessor>;
diff --git a/tests/validation_new/CPP/NormalizationLayer.cpp b/tests/validation_new/CPP/NormalizationLayer.cpp
index 72f49007cc..a8818d8b5c 100644
--- a/tests/validation_new/CPP/NormalizationLayer.cpp
+++ b/tests/validation_new/CPP/NormalizationLayer.cpp
@@ -268,6 +268,7 @@ SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLay
template SimpleTensor<float> normalization_layer(const SimpleTensor<float> &src, NormalizationLayerInfo info);
template SimpleTensor<half_float::half> normalization_layer(const SimpleTensor<half_float::half> &src, NormalizationLayerInfo info);
template SimpleTensor<qint8_t> normalization_layer(const SimpleTensor<qint8_t> &src, NormalizationLayerInfo info);
+template SimpleTensor<qint16_t> normalization_layer(const SimpleTensor<qint16_t> &src, NormalizationLayerInfo info);
} // namespace reference
} // namespace validation
} // namespace test
diff --git a/tests/validation_new/NEON/NormalizationLayer.cpp b/tests/validation_new/NEON/NormalizationLayer.cpp
index f364975332..dfe793131a 100644
--- a/tests/validation_new/NEON/NormalizationLayer.cpp
+++ b/tests/validation_new/NEON/NormalizationLayer.cpp
@@ -50,7 +50,8 @@ constexpr float tolerance_f16 = 0.001f;
#endif /* ARM_COMPUTE_ENABLE_FP16 */
constexpr float tolerance_f32 = 0.00001f;
/** Tolerance for fixed point operations */
-constexpr int8_t tolerance_qs8 = 2;
+constexpr int8_t tolerance_qs8 = 2;
+constexpr int16_t tolerance_qs16 = 3;
/** Input data set. */
const auto NormalizationDataset = combine(combine(combine(datasets::SmallShapes(), datasets::NormalizationTypes()), framework::dataset::make("NormalizationSize", 3, 9, 2)),
@@ -116,6 +117,24 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixedPointFixture<int8_t>,
validate(Accessor(_target), _reference, tolerance_qs8);
}
TEST_SUITE_END()
+
+TEST_SUITE(QS16)
+// Testing for fixed point position [1,14) as reciprocal limits the maximum fixed point position to 14
+FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(NormalizationDataset, framework::dataset::make("DataType",
+ DataType::QS16)),
+ framework::dataset::make("FractionalBits", 1, 14)))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qs16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(NormalizationDataset, framework::dataset::make("DataType",
+ DataType::QS16)),
+ framework::dataset::make("FractionalBits", 1, 14)))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qs16);
+}
+TEST_SUITE_END()
TEST_SUITE_END()
TEST_SUITE_END()
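
The QS16 fixtures sweep the fractional-bit position, which trades range for resolution; the comment in the suite notes that the reciprocal caps the usable position. A small standalone illustration of that trade-off for a 16-bit value follows, using plain Q-format arithmetic independent of the library.

    #include <cstdio>

    // Range and step of a 16-bit fixed-point value at fractional position p:
    // a raw int16 r represents r / 2^p.
    int main()
    {
        for(int p : {1, 7, 13})
        {
            const float step = 1.0f / (1 << p);
            const float max  = 32767.0f * step;
            std::printf("p=%2d  step=%g  max=%g\n", p, step, max);
        }
        return 0;
    }

More fractional bits give finer resolution but a smaller representable range, which is the trade-off the fixed-point test sweep exercises.
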