authorMichalis Spyrou <michalis.spyrou@arm.com>2017-06-22 12:57:51 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commitbbd9fb95daa08d6da67c567b40ca2cd032f7a2d3 (patch)
treec1401585f64396d6f22bb790442d8183f3a17a9e
parent2eac5bd444d16e4e81c427d5a99e1534b387e211 (diff)
downloadComputeLibrary-bbd9fb95daa08d6da67c567b40ca2cd032f7a2d3.tar.gz
COMPMID-412: Port PoolingLayer to use fixed point 16.
Change-Id: I2005de4c7c14526996309826d33a0ec8e732d2d5
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78720
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Steven Niu <steven.niu@arm.com>
-rw-r--r--  arm_compute/core/FixedPoint.h                          18
-rw-r--r--  arm_compute/core/FixedPoint.inl                        16
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.inl                 40
-rw-r--r--  arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h   16
-rw-r--r--  src/core/NEON/kernels/NEPoolingLayerKernel.cpp        228
-rw-r--r--  tests/Utils.h                                           7
-rw-r--r--  tests/dataset/PoolingLayerDataset.h                     4
-rw-r--r--  tests/validation/NEON/PoolingLayer.cpp                 20
-rw-r--r--  tests/validation/Reference.cpp                          3
-rw-r--r--  tests/validation/ReferenceCPP.cpp                       4
-rw-r--r--  tests/validation/ReferenceCPP.h                         9
-rw-r--r--  tests/validation/TensorOperations.h                   135
-rw-r--r--  tests/validation/TensorVisitors.h                       8
13 files changed, 440 insertions(+), 68 deletions(-)
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index f166d93c3e..82c2d3347e 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -40,6 +40,24 @@ using qint64_t = int64_t; /**< 64 bit fixed point scalar value */
*/
qint8_t sqshl_qs8(qint8_t a, int shift);
+/** 8 bit fixed point scalar shift right
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] shift Shift amount (positive only values)
+ *
+ * @return The result of the 8 bit fixed point shift
+ */
+qint8_t sshr_qs8(qint8_t a, int shift);
+
+/** 16 bit fixed point scalar shift right
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] shift Shift amount (positive only values)
+ *
+ * @return The result of the 16 bit fixed point shift
+ */
+qint16_t sshr_qs16(qint16_t a, int shift);
+
/** 16 bit fixed point scalar saturating shift left
*
* @param[in] a First 16 bit fixed point input
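For context on the Q formats these new scalar helpers operate on: a QS8 or QS16 raw value with fixed point position n represents raw / 2^n. A minimal standalone sketch of that convention follows; the helper names are illustrative only and not part of the library.

#include <cstdint>
#include <cmath>
#include <cstdio>

// Hypothetical helpers showing the assumed Qm.n convention, not library code.
int8_t float_to_qs8(float value, int fixed_point_position)
{
    return static_cast<int8_t>(std::lround(value * (1 << fixed_point_position)));
}

float qs8_to_float(int8_t value, int fixed_point_position)
{
    return static_cast<float>(value) / (1 << fixed_point_position);
}

int main()
{
    const int fpp = 3;                        // QS8 with 3 fractional bits
    const int8_t q = float_to_qs8(1.5f, fpp); // 1.5 * 8 = 12
    std::printf("raw=%d back=%.3f\n", q, qs8_to_float(q, fpp));
    return 0;
}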
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
index b921b32ed9..5ea0f6c825 100644
--- a/arm_compute/core/FixedPoint.inl
+++ b/arm_compute/core/FixedPoint.inl
@@ -21,6 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/Error.h"
+
#include <cmath>
#include <limits>
@@ -59,6 +61,20 @@ inline qint16_t sqshl_qs16(qint16_t a, int shift)
return saturate_convert<qint32_t, qint16_t>(tmp);
}
+inline qint8_t sshr_qs8(qint8_t a, int shift)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(shift == 0, "Shift should not be zero");
+ const qint8_t round_val = 1 << (shift - 1);
+ return sqadd_qs8(a, round_val) >> shift;
+}
+
+inline qint16_t sshr_qs16(qint16_t a, int shift)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(shift == 0, "Shift should not be zero");
+ const qint16_t round_val = 1 << (shift - 1);
+ return sqadd_qs16(a, round_val) >> shift;
+}
+
inline qint8_t sabs_qs8(qint8_t a)
{
return (a < 0) ? (a == std::numeric_limits<int8_t>::min()) ? std::numeric_limits<int8_t>::max() : -a : a;
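The new sshr_qs8/sshr_qs16 implementations round to nearest rather than truncate: adding 1 << (shift - 1) before the arithmetic shift carries the half bit into the result. A small standalone illustration of the idea follows, using plain integer arithmetic without the saturating add the library uses to guard the bias addition.

#include <cstdint>
#include <cstdio>

// Rounding right shift: bias by half of the divisor before shifting.
static int16_t shr_round(int16_t a, int shift)
{
    const int16_t round_val = 1 << (shift - 1);
    return static_cast<int16_t>((a + round_val) >> shift);
}

int main()
{
    // 13 / 4 = 3.25 -> 3, 14 / 4 = 3.5 -> 4; plain truncation gives 3 for both.
    std::printf("%d %d\n", shr_round(13, 2), shr_round(14, 2));
    return 0;
}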
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index dd1066d6bc..a5d9e7685d 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -25,8 +25,9 @@
namespace arm_compute
{
-/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
- * Format is in Q0.7 for all elements */
+/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements
+ */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
{
@@ -37,8 +38,9 @@ static const std::array<qint8x8_t, 4> exp_tab_qs8 =
}
};
-/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
- * Format is in Q0.15 for all elements */
+/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
+ * Format is in Q0.15 for all elements
+ */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
{
@@ -49,8 +51,9 @@ static const std::array<qint16x4_t, 4> exp_tab_qs16 =
}
};
-/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
- * Format is in Q0.7 for all elements */
+/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements
+ */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
{
@@ -61,8 +64,9 @@ static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
}
};
-/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements */
+/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
+ * Format is in Q0.15 for all elements
+ */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
{
@@ -73,8 +77,9 @@ static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
}
};
-/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
- * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6
+ */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
{
@@ -85,8 +90,9 @@ static const std::array<qint8x8_t, 4> log_tab_qs8 =
}
};
-/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
+/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
+ * Format is in Q0.15 for all elements except the first one which is in Q1.14
+ */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
{
@@ -97,8 +103,9 @@ static const std::array<qint16x4_t, 4> log_tab_qs16 =
}
};
-/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
- * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6
+ */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
{
@@ -109,8 +116,9 @@ static const std::array<qint8x16_t, 4> log_tabq_qs8 =
}
};
-/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
+/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
+ * Format is in Q0.15 for all elements except the first one which is in Q1.14
+ */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
{
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index a5de81137b..8a938a7f34 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -48,7 +48,7 @@ public:
~NEPoolingLayerKernel() = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
@@ -81,6 +81,13 @@ private:
*/
template <PoolingType pooling_type>
void pooling2_q8(const Window &window_input, const Window &window);
+ /** Function to perform 2x2 pooling for 16bit fixed point.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling2_q16(const Window &window_input, const Window &window);
/** Function to perform 3x3 pooling.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -102,6 +109,13 @@ private:
*/
template <PoolingType pooling_type>
void pooling3_q8(const Window &window_input, const Window &window);
+ /** Function to perform 3x3 pooling for 16bit fixed point.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling3_q16(const Window &window_input, const Window &window);
/** Function to perform 7x7 pooling.
*
* @param[in] window_input Input region on which to execute the kernel.
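The new pooling2_q16/pooling3_q16 declarations follow the kernel's existing dispatch pattern: configure() selects a templated member function once, based on data type and pooling type, and run() invokes it through a member function pointer. A hypothetical minimal sketch of that pattern is shown below; the names are illustrative, not the library's API.

#include <cstdio>

enum class PoolingType { AVG, MAX };

class MiniPoolingKernel
{
public:
    // Pick the specialisation once at configure time.
    void configure(PoolingType type)
    {
        _func = (type == PoolingType::AVG) ? &MiniPoolingKernel::pool2<PoolingType::AVG>
                                           : &MiniPoolingKernel::pool2<PoolingType::MAX>;
    }
    // Call the selected specialisation at run time.
    void run()
    {
        (this->*_func)();
    }

private:
    template <PoolingType pooling_type>
    void pool2()
    {
        std::printf("running %s 2x2 pooling\n", pooling_type == PoolingType::AVG ? "AVG" : "MAX");
    }

    using PoolingFunction = void (MiniPoolingKernel::*)();
    PoolingFunction _func = nullptr;
};

int main()
{
    MiniPoolingKernel kernel;
    kernel.configure(PoolingType::AVG);
    kernel.run();
    return 0;
}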
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 1c96007c9b..df56c23800 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -65,7 +65,20 @@ inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int
const int end_x = std::min(start_x + pool_size, upper_bound_w);
const int end_y = std::min(start_y + pool_size, upper_bound_h);
const int val = ((end_y - start_y) * (end_x - start_x));
- return scale_values_q8[val] >> (7 - fixed_point_position);
+ return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
+}
+
+inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+ int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
+ static std::array<qint16_t, 10> scale_values_q16 =
+ { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int val = ((end_y - start_y) * (end_x - start_x));
+ return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
}
} // namespace
@@ -97,12 +110,14 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
ARM_COMPUTE_UNUSED(supported_pool_sizes);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
+ ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
@@ -133,16 +148,31 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
switch(pool_size)
{
case 2:
- num_elems_processed_per_iteration = 8;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
break;
case 3:
- num_elems_processed_per_iteration = 7;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
break;
default:
ARM_COMPUTE_ERROR("Pooling size not supported");
break;
}
- num_elems_horizontal_window = 8;
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ break;
+ case DataType::QS16:
+ num_elems_read_per_iteration = 8;
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
+ break;
+ case 3:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ }
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
break;
#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
@@ -210,6 +240,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
}
+ else if(input->info()->data_type() == DataType::QS16)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ }
else if(input->info()->data_type() == DataType::F16)
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
@@ -224,6 +258,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
}
+ else if(input->info()->data_type() == DataType::QS16)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ }
else if(input->info()->data_type() == DataType::F16)
{
_func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
@@ -274,7 +312,8 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window
{
const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
- qint8x8_t res = {};
+ qint8x8_t lower_res = {};
+ qint8x8_t upper_res = {};
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
@@ -283,14 +322,96 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window
// Perform pooling
const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
- res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ if(pool_stride_x == 1)
+ {
+ const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
+ upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
+ }
}
else
{
const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
- res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ if(pool_stride_x == 1)
+ {
+ const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
+ upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
+ }
+ }
+ if(pool_stride_x == 1)
+ {
+ const qint8x8x2_t res = vzip_s8(lower_res, upper_res);
+ vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
+ }
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
+ qint16x4_t lower_res = {};
+ qint16x4_t upper_res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint16x4_t scale_vec = vdup_n_qs16(scale);
+
+ // Perform pooling
+ const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
+ lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
+ if(pool_stride_x == 1)
+ {
+ const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
+ upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
+ }
+ }
+ else
+ {
+ const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
+ lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
+ if(pool_stride_x == 1)
+ {
+ const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
+ upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
+ }
+ }
+ if(pool_stride_x == 1)
+ {
+ const qint16x4x2_t res = vzip_s16(lower_res, upper_res);
+ vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
}
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
},
input, output);
}
@@ -464,8 +585,7 @@ void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
- const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
- const qint8x8_t scale_vec = vdup_n_qs8(scale);
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
// Perform pooling for stride 2
const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
@@ -476,13 +596,16 @@ void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window
{
const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
res = vtbl2_s8(table, lookup_val);
+ res = vqmul_qs8(res, scale_vec, fixed_point_position);
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
}
else
{
- res = vget_low_s8(final_sum);
+ const qint8x16_t scale_vec = vdupq_n_qs8(scale);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
}
- res = vqmul_qs8(res, scale_vec, fixed_point_position);
}
else
{
@@ -496,13 +619,83 @@ void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window
const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
res = vtbl2_s8(table, lookup_val);
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
+ }
+ }
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
+
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+
+ // Perform pooling for stride 2
+ const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
+ const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
+ const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
+ const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
+ if(pool_stride_x == 2)
+ {
+ const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
+ const qint16x4_t scale_vec = vdup_n_qs16(scale);
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
+ }
+ else
+ {
+ const qint16x8_t scale_vec = vdupq_n_qs16(scale);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
+ }
+ }
+ else
+ {
+ const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
+ const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
+ const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
+ const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
+
+ if(pool_stride_x == 2)
+ {
+ const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
}
else
{
- res = vget_low_s8(final_max);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
}
}
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
},
input, output);
}
@@ -630,6 +823,7 @@ void NEPoolingLayerKernel::run(const Window &window)
switch(_input->info()->data_type())
{
case DataType::QS8:
+ case DataType::QS16:
case DataType::F16:
{
window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
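Two ideas carry the QS16 port above. For the average path, the new scale_values_q16 entries appear to encode 1/val in Q0.15 (for example 0x2AAB is roughly 1/3), which sshr_qs16 then rescales to the tensor's fixed point position. For the stride 1 paths, one vector of vertical results serves two sets of output columns: a pairwise reduction gives the windows starting at even columns, a vext by one element followed by the same reduction gives the odd-start windows, and the two halves are interleaved on store. The following standalone 2x2 max pooling sketch illustrates the second idea with plain int16 NEON intrinsics, not the library's qint16 wrappers, and without the average scale step.

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main()
{
    // Two adjacent input rows, one vector's worth of columns each.
    const int16_t top[8]    = { 1, 5, 2, 8, 3, 7, 4, 6 };
    const int16_t bottom[8] = { 0, 9, 1, 2, 6, 3, 5, 7 };

    // Vertical max of the two rows.
    const int16x8_t rows_max = vmaxq_s16(vld1q_s16(top), vld1q_s16(bottom));

    // Windows starting at even columns: pairwise max along the row.
    const int16x4_t even = vpmax_s16(vget_low_s16(rows_max), vget_high_s16(rows_max));

    // Windows starting at odd columns: shift by one element and repeat.
    const int16x8_t shifted = vextq_s16(rows_max, rows_max, 1);
    const int16x4_t odd     = vpmax_s16(vget_low_s16(shifted), vget_high_s16(shifted));

    // Interleaved store restores spatial order: even[0], odd[0], even[1], ...
    int16_t out[8];
    const int16x4x2_t pair = { { even, odd } };
    vst2_s16(out, pair);

    for(int i = 0; i < 7; ++i) // out[7] pairs with wrapped data from vext, ignore it
    {
        std::printf("%d ", out[i]);
    }
    std::printf("\n");
    return 0;
}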
diff --git a/tests/Utils.h b/tests/Utils.h
index 389c9806bb..06d27b87fa 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -198,8 +198,11 @@ inline ValidRegion shape_to_valid_region(TensorShape shape, bool border_undefine
ARM_COMPUTE_ERROR_ON(shape.num_dimensions() < 2);
anchor.set(0, border_size.left);
anchor.set(1, border_size.top);
- shape.set(0, shape.x() - border_size.left - border_size.right);
- shape.set(1, shape.y() - border_size.top - border_size.bottom);
+ const int x_dim_shape = shape.x() - border_size.left - border_size.right;
+ const int y_dim_shape = shape.y() - border_size.top - border_size.bottom;
+ ARM_COMPUTE_ERROR_ON(x_dim_shape < 0 || y_dim_shape < 0);
+ shape.set(0, x_dim_shape);
+ shape.set(1, y_dim_shape);
}
return ValidRegion(std::move(anchor), std::move(shape));
}
diff --git a/tests/dataset/PoolingLayerDataset.h b/tests/dataset/PoolingLayerDataset.h
index 5cdece4f66..1496cad379 100644
--- a/tests/dataset/PoolingLayerDataset.h
+++ b/tests/dataset/PoolingLayerDataset.h
@@ -134,7 +134,7 @@ public:
~GoogLeNetPoolingLayerDataset() = default;
};
-class RandomPoolingLayerDataset final : public PoolingLayerDataset<8>
+class RandomPoolingLayerDataset final : public PoolingLayerDataset<10>
{
public:
RandomPoolingLayerDataset()
@@ -148,6 +148,8 @@ public:
PoolingLayerDataObject{ TensorShape(13U, 13U, 32U), TensorShape(6U, 6U, 32U), PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(2, 2, 0, 0)) },
PoolingLayerDataObject{ TensorShape(24U, 24U, 10U), TensorShape(12U, 12U, 10U), PoolingLayerInfo(PoolingType::AVG, 2, PadStrideInfo(2, 2, 0, 0)) },
PoolingLayerDataObject{ TensorShape(8U, 8U, 30U), TensorShape(4U, 4U, 30U), PoolingLayerInfo(PoolingType::AVG, 2, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(7U, 7U, 10U), TensorShape(7U, 7U, 10U), PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1)) },
+ PoolingLayerDataObject{ TensorShape(7U, 7U, 10U), TensorShape(7U, 7U, 10U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1)) },
}
{
}
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 0d2f285dff..8b4ff18f8c 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -81,6 +81,7 @@ Tensor compute_pooling_layer(const TensorShape &shape_in, const TensorShape &sha
max = 1;
break;
case DataType::QS8:
+ case DataType::QS16:
min = -(1 << fixed_point_position);
max = (1 << fixed_point_position);
break;
@@ -168,6 +169,7 @@ BOOST_AUTO_TEST_SUITE_END()
#endif /* ARM_COMPUTE_ENABLE_FP16 */
BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_AUTO_TEST_SUITE(QS8)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
BOOST_DATA_TEST_CASE(RandomDataset,
RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 5),
@@ -184,6 +186,24 @@ BOOST_DATA_TEST_CASE(RandomDataset,
}
BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE(QS16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RandomDataset,
+ RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::QS16) * boost::unit_test::data::xrange(1, 13),
+ obj, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info, fixed_point_position);
+
+ // Validate output
+ validate(Accessor(dst), ref_dst, tolerance_q, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+
BOOST_AUTO_TEST_SUITE_END()
BOOST_AUTO_TEST_SUITE_END()
#endif /* DOXYGEN_SKIP_THIS */
diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp
index 0fca661dc4..9cdd2d74d4 100644
--- a/tests/validation/Reference.cpp
+++ b/tests/validation/Reference.cpp
@@ -525,6 +525,7 @@ RawTensor Reference::compute_reference_pooling_layer(const TensorShape &shape_in
max = 1;
break;
case DataType::QS8:
+ case DataType::QS16:
min = -(1 << fixed_point_position);
max = (1 << fixed_point_position);
break;
@@ -535,7 +536,7 @@ RawTensor Reference::compute_reference_pooling_layer(const TensorShape &shape_in
library->fill(ref_src, distribution, 0.0);
// Compute reference
- ReferenceCPP::pooling_layer(ref_src, ref_dst, pool_info, fixed_point_position);
+ ReferenceCPP::pooling_layer(ref_src, ref_dst, pool_info);
return ref_dst;
}
diff --git a/tests/validation/ReferenceCPP.cpp b/tests/validation/ReferenceCPP.cpp
index 069cc1d871..4a2d7bebba 100644
--- a/tests/validation/ReferenceCPP.cpp
+++ b/tests/validation/ReferenceCPP.cpp
@@ -286,11 +286,11 @@ void ReferenceCPP::fully_connected_layer(const RawTensor &src, const RawTensor &
}
// Pooling Layer
-void ReferenceCPP::pooling_layer(const RawTensor &src, RawTensor &dst, PoolingLayerInfo pool_info, int fixed_point_position)
+void ReferenceCPP::pooling_layer(const RawTensor &src, RawTensor &dst, PoolingLayerInfo pool_info)
{
const TensorVariant s = TensorFactory::get_tensor(src);
TensorVariant d = TensorFactory::get_tensor(dst);
- boost::apply_visitor(tensor_visitors::pooling_layer_visitor(s, pool_info, fixed_point_position), d);
+ boost::apply_visitor(tensor_visitors::pooling_layer_visitor(s, pool_info), d);
}
// ROI Pooling Layer
diff --git a/tests/validation/ReferenceCPP.h b/tests/validation/ReferenceCPP.h
index 2d35fa9590..cc886aefc9 100644
--- a/tests/validation/ReferenceCPP.h
+++ b/tests/validation/ReferenceCPP.h
@@ -263,12 +263,11 @@ public:
static void fully_connected_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst);
/** Pooling layer of @p src based on the information from @p pool_info.
*
- * @param[in] src Input tensor.
- * @param[out] dst Result tensor.
- * @param[in] pool_info Pooling Layer information.
- * @param[in] fixed_point_position Fixed point position. (Optional)
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] pool_info Pooling Layer information.
*/
- static void pooling_layer(const RawTensor &src, RawTensor &dst, PoolingLayerInfo pool_info, int fixed_point_position = 0);
+ static void pooling_layer(const RawTensor &src, RawTensor &dst, PoolingLayerInfo pool_info);
/** ROI Pooling layer of @p src based on the information from @p pool_info and @p rois.
*
* @param[in] src Input tensor.
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 3220d80a04..887d52887d 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -24,7 +24,6 @@
#ifndef __ARM_COMPUTE_TEST_TENSOR_OPERATIONS_H__
#define __ARM_COMPUTE_TEST_TENSOR_OPERATIONS_H__
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Types.h"
#include "support/ToolchainSupport.h"
#include "tests/Types.h"
@@ -961,8 +960,8 @@ void fully_connected_layer(const Tensor<T> &in, const Tensor<T> &weights, const
}
// Pooling layer
-template <typename T>
-void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_info, int fixed_point_position)
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type * = nullptr>
+void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_info)
{
const int pool_size = pool_info.pool_size();
PoolingType type = pool_info.pool_type();
@@ -1054,38 +1053,136 @@ void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_in
hstart = std::max(hstart, 0);
wend = std::min(wend, w_in);
hend = std::min(hend, h_in);
- if(is_floating_point<T>::value)
+
+ for(int y = hstart; y < hend; ++y)
{
- for(int y = hstart; y < hend; ++y)
+ for(int x = wstart; x < wend; ++x)
{
- for(int x = wstart; x < wend; ++x)
- {
- avg_val += in[r * h_in * w_in + y * w_in + x];
- }
+ avg_val += in[r * h_in * w_in + y * w_in + x];
}
- out[r * h_out * w_out + h * pooled_w + w] = avg_val / pool;
}
- else
- {
- static std::array<qint8_t, 10> scale_values_q8 =
- { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
+ out[r * h_out * w_out + h * pooled_w + w] = avg_val / pool;
+ }
+ }
+ }
+ }
+}
- for(int y = hstart; y < hend; ++y)
+// Pooling layer
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_info)
+{
+ const int pool_size = pool_info.pool_size();
+ PoolingType type = pool_info.pool_type();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ int pad_x = 0;
+ int pad_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info().stride();
+ std::tie(pad_x, pad_y) = pool_info.pad_stride_info().pad();
+
+ const int w_in = static_cast<int>(in.shape()[0]);
+ const int h_in = static_cast<int>(in.shape()[1]);
+
+ const int w_out = static_cast<int>(out.shape()[0]);
+ const int h_out = static_cast<int>(out.shape()[1]);
+
+ int upper_dims = in.shape().total_size() / (w_in * h_in);
+
+ int pooled_w = 0;
+ int pooled_h = 0;
+ if(pool_info.pad_stride_info().round() == DimensionRoundingType::CEIL)
+ {
+ pooled_w = static_cast<int>(ceil(static_cast<float>(w_in + 2 * pad_x - pool_size) / pool_stride_x)) + 1;
+ pooled_h = static_cast<int>(ceil(static_cast<float>(h_in + 2 * pad_y - pool_size) / pool_stride_y)) + 1;
+ }
+ else
+ {
+ pooled_w = static_cast<int>(floor(static_cast<float>(w_in + 2 * pad_x - pool_size) / pool_stride_x)) + 1;
+ pooled_h = static_cast<int>(floor(static_cast<float>(h_in + 2 * pad_y - pool_size) / pool_stride_y)) + 1;
+ }
+
+ if((pooled_w - 1) * pool_stride_x >= w_in + pad_x)
+ {
+ --pooled_w;
+ }
+ if((pooled_h - 1) * pool_stride_y >= h_in + pad_y)
+ {
+ --pooled_h;
+ }
+
+ if(type == PoolingType::MAX)
+ {
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int h = 0; h < pooled_h; ++h)
+ {
+ for(int w = 0; w < pooled_w; ++w)
+ {
+ int wstart = w * pool_stride_x - pad_x;
+ int hstart = h * pool_stride_y - pad_y;
+ int wend = std::min(wstart + pool_size, w_in);
+ int hend = std::min(hstart + pool_size, h_in);
+ wstart = std::max(wstart, 0);
+ hstart = std::max(hstart, 0);
+
+ T max_val = std::numeric_limits<T>::lowest();
+ for(int y = hstart; y < hend; ++y)
+ {
+ for(int x = wstart; x < wend; ++x)
{
- for(int x = wstart; x < wend; ++x)
+ T val = in[r * h_in * w_in + y * w_in + x];
+ if(val > max_val)
{
- avg_val = sqadd_qs8(avg_val, in[r * h_in * w_in + y * w_in + x]);
+ max_val = val;
}
}
- out[r * h_out * w_out + h * pooled_w + w] = sqmul_qs8(avg_val, (scale_values_q8[pool] >> (7 - fixed_point_position)), fixed_point_position);
}
+
+ out[r * h_out * w_out + h * pooled_w + w] = max_val;
+ }
+ }
+ }
+ }
+ else // Average pooling
+ {
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int h = 0; h < pooled_h; ++h)
+ {
+ for(int w = 0; w < pooled_w; ++w)
+ {
+ int wstart = w * pool_stride_x - pad_x;
+ int hstart = h * pool_stride_y - pad_y;
+ int wend = std::min(wstart + pool_size, w_in + pad_x);
+ int hend = std::min(hstart + pool_size, h_in + pad_y);
+ int pool = (hend - hstart) * (wend - wstart);
+ wstart = std::max(wstart, 0);
+ hstart = std::max(hstart, 0);
+ wend = std::min(wend, w_in);
+ hend = std::min(hend, h_in);
+
+ using namespace fixed_point_arithmetic;
+
+ const int fixed_point_position = in.fixed_point_position();
+ const fixed_point<T> invpool_fp(1.f / static_cast<float>(pool), fixed_point_position);
+ fixed_point<T> avg_val(0, fixed_point_position, true);
+ for(int y = hstart; y < hend; ++y)
+ {
+ for(int x = wstart; x < wend; ++x)
+ {
+ const fixed_point<T> in_fp(in[r * h_in * w_in + y * w_in + x], fixed_point_position, true);
+ avg_val = add(avg_val, in_fp);
+ }
+ }
+ out[r * h_out * w_out + h * pooled_w + w] = mul(avg_val, invpool_fp).raw();
}
}
}
}
}
-// Pooling layer
+// ROI Pooling layer
template <typename T>
void roi_pooling_layer(const Tensor<T> &in, Tensor<T> &out, const std::vector<ROI> &rois, const ROIPoolingLayerInfo &pool_info)
{
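The rewritten integral reference path above averages by multiplying the Q-format accumulator with a fixed point reciprocal of the pool size instead of dividing. A minimal scalar sketch of that scheme follows, with a hypothetical helper rather than the fixed_point_arithmetic API the test suite uses.

#include <cstdint>
#include <cstdio>

// Average of Q(fpp) values: sum, multiply by 1/count in Q(fpp), shift back down with rounding.
static int16_t fixed_point_average(const int16_t *values, int count, int fpp)
{
    int32_t acc = 0;
    for(int i = 0; i < count; ++i)
    {
        acc += values[i];                         // sum stays in Q(fpp)
    }
    const int32_t inv_count = (1 << fpp) / count; // 1/count in Q(fpp), truncated
    const int32_t product   = acc * inv_count;    // product is in Q(2*fpp)
    return static_cast<int16_t>((product + (1 << (fpp - 1))) >> fpp); // back to Q(fpp), rounded
}

int main()
{
    const int fpp = 12;
    // 1.0, 2.0, 3.0, 4.0 in Q12 -> average 2.5 -> 10240 in Q12.
    const int16_t v[4] = { 1 << 12, 2 << 12, 3 << 12, 4 << 12 };
    std::printf("%d\n", fixed_point_average(v, 4, fpp));
    return 0;
}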
diff --git a/tests/validation/TensorVisitors.h b/tests/validation/TensorVisitors.h
index 5ee7ae3a9f..193697acf0 100644
--- a/tests/validation/TensorVisitors.h
+++ b/tests/validation/TensorVisitors.h
@@ -27,6 +27,7 @@
#include "Tensor.h"
#include "TensorOperations.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/Lut.h"
#include "boost_wrapper.h"
@@ -258,8 +259,8 @@ private:
struct pooling_layer_visitor : public boost::static_visitor<>
{
public:
- explicit pooling_layer_visitor(const TensorVariant &in, PoolingLayerInfo pool_info, int fixed_point_position = 0)
- : _in(in), _pool_info(pool_info), _fixed_point_position(fixed_point_position)
+ explicit pooling_layer_visitor(const TensorVariant &in, PoolingLayerInfo pool_info)
+ : _in(in), _pool_info(pool_info)
{
}
@@ -267,13 +268,12 @@ public:
void operator()(Tensor<T> &out) const
{
const Tensor<T> &in = boost::get<Tensor<T>>(_in);
- tensor_operations::pooling_layer(in, out, _pool_info, _fixed_point_position);
+ tensor_operations::pooling_layer(in, out, _pool_info);
}
private:
const TensorVariant &_in;
PoolingLayerInfo _pool_info;
- int _fixed_point_position;
};
// ROI Pooling layer