author    Georgios Pinitas <georgios.pinitas@arm.com>  2020-06-03 20:16:46 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>  2020-06-09 11:58:01 +0000
commit    980a9168b81d778f4902973b4920b54c103907e0 (patch)
tree      d2e8bf3527db8fe39cec8c51c6a914b721c35b03
parent    2d10f186aacfc56b601b3cdaffa942cc6e6d1f53 (diff)
download  ComputeLibrary-980a9168b81d778f4902973b4920b54c103907e0.tar.gz
COMPMID-3177: Remove padding from NEBatchNormalizationLayer
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9be23e6ef1f552eb159e39fda16c82fa20124094
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3307
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
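The key change is visible in the kernel loops below: instead of requiring padded tensors and a fixed per-iteration step, the window's X dimension is traversed manually with full vector-width iterations followed by a scalar loop for the leftover elements. A minimal standalone sketch of that pattern (plain scalar code standing in for the NEON wrapper calls; all names here are illustrative, not the library's API):

    #include <cmath>
    #include <vector>

    // Batch normalization over one row, written in the "no padding" style the
    // patch adopts: a main loop that consumes whole vector-width chunks (so no
    // read or write ever runs past the buffer), then a scalar tail loop for
    // whatever remains, instead of relying on padded-out elements.
    void batch_norm_row(const std::vector<float> &in, std::vector<float> &out,
                        float mean, float var, float gamma, float beta, float epsilon)
    {
        const int   window_step_x = 4; // elements per (simulated) vector operation
        const int   window_end_x  = static_cast<int>(in.size());
        const float denominator   = 1.f / std::sqrt(var + epsilon);

        int x = 0;
        // Main loop: full "vectors" only
        for(; x <= window_end_x - window_step_x; x += window_step_x)
        {
            for(int lane = 0; lane < window_step_x; ++lane)
            {
                const float x_bar = (in[x + lane] - mean) * denominator;
                out[x + lane]     = beta + x_bar * gamma;
            }
        }
        // Tail loop: leftover elements one by one
        for(; x < window_end_x; ++x)
        {
            const float x_bar = (in[x] - mean) * denominator;
            out[x]            = beta + x_bar * gamma;
        }
    }

Because the tail loop absorbs the remainder, validate_and_configure_window() below can build the window with calculate_max_window(*input, Steps()) and drop update_window_and_padding() entirely.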
-rw-r--r-- arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h   |  29
-rw-r--r-- arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h | 108
-rw-r--r-- src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp         | 372
-rw-r--r-- tests/validation/NEON/BatchNormalizationLayer.cpp                 |  43
4 files changed, 266 insertions, 286 deletions
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index d59ed7baf0..7371e3c177 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
namespace arm_compute
{
+// Forward declarations
class ITensor;
/** Interface for the batch normalization layer kernel.
@@ -97,40 +98,26 @@ private:
/** Configure execution function in case of fused activation **/
void configure_fused();
- /** Template function to run batch normalization on fp16
- *
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation, typename F>
- void batch_normalization_fp16_nchw(const Window &window);
- /** Template function to run batch normalization on fp16 on tensors with NHWC format
- *
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation, typename F>
- void batch_normalization_fp16_nhwc(const Window &window);
/** Template function to run batch normalization
*
+ * @tparam T Specialization data type
* @tparam fused_activation Boolean that flags if it's a fused activation or not
* @tparam F Activation function functor to run
*
* @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- template <bool fused_activation, typename F>
- void batch_normalization_fp32_nchw(const Window &window);
+ template <typename T, bool fused_activation, typename F>
+ void batch_normalization_nchw(const Window &window);
/** Template function to run batch normalization on tensors with NHWC format
*
+ * @tparam T Specialization data type
* @tparam fused_activation Boolean that flags if it's a fused activation or not
* @tparam F Activation function functor to run
*
* @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- template <bool fused_activation, typename F>
- void batch_normalization_fp32_nhwc(const Window &window);
+ template <typename T, bool fused_activation, typename F>
+ void batch_normalization_nhwc(const Window &window);
/** Common signature for all the batch normalization functions
*
* @param[in] window Region on which to execute the kernel.
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 4861559695..7945418ac5 100644
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,7 @@ struct dummy
{
ARM_COMPUTE_UNUSED(act_info);
}
+
/** Run activation function.
*
* @param[in] vval Vector of values.
@@ -53,6 +54,15 @@ struct dummy
{
ARM_COMPUTE_UNUSED(vval);
}
+
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ ARM_COMPUTE_UNUSED(val);
+ }
};
/** Linear activation object */
template <typename T, int S>
@@ -68,8 +78,10 @@ struct linear
* @param[in] act_info Activation layer information.
*/
explicit linear(ActivationLayerInfo act_info)
- : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
- vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
+ : alpha(act_info.a()),
+ beta(act_info.b()),
+ valpha(wrapper::vdup_n(static_cast<T>(alpha), ExactTagType{})),
+ vbeta(wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))
{
}
@@ -79,13 +91,22 @@ struct linear
*/
void operator()(ExactType &vval)
{
- vval = wrapper::vmla(vval, valpha, vbeta);
+ vval = wrapper::vmla(vbeta, vval, valpha);
}
- /** Vector of alphas. */
- const ExactType valpha;
- /** Vector of betas. */
- const ExactType vbeta;
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ val = alpha * val + beta;
+ }
+
+ const T alpha; /**< Scalar alpha */
+ const T beta; /**< Scalar beta */
+ const ExactType valpha; /**< Vector of alphas. */
+ const ExactType vbeta; /**< Vector of betas. */
};
/** Square activation object */
template <typename T, int S>
@@ -113,6 +134,15 @@ struct square
{
vval = wrapper::vmul(vval, vval);
}
+
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ val = val * val;
+ }
};
/** Logistic activation object */
template <typename T, int S>
@@ -128,7 +158,7 @@ struct logistic
* @param[in] act_info Activation layer information.
*/
explicit logistic(ActivationLayerInfo act_info)
- : vone(wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}))
+ : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
@@ -142,6 +172,15 @@ struct logistic
vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
}
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ val = 1 / (1 + std::exp(-val));
+ }
+
/** Vector of ones. */
const ExactType vone;
};
@@ -159,7 +198,7 @@ struct relu
* @param[in] act_info Activation layer information.
*/
explicit relu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
+ : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
@@ -173,6 +212,15 @@ struct relu
vval = wrapper::vmax(vzero, vval);
}
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ val = std::max(static_cast<T>(0), val);
+ }
+
/** Vector of zeroes. */
const ExactType vzero;
};
@@ -190,7 +238,8 @@ struct brelu
* @param[in] act_info Activation layer information.
*/
explicit brelu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
+ : alpha(act_info.a()),
+ vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})),
valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
{
}
@@ -204,10 +253,18 @@ struct brelu
vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
}
- /** Vector of zeroes. */
- const ExactType vzero;
- /** Vector of alphas. */
- const ExactType valpha;
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ val = std::min(alpha, std::max(static_cast<T>(0), val));
+ }
+
+ const T alpha; /**< Scalar alpha */
+ const ExactType vzero; /**< Vector of zeroes. */
+ const ExactType valpha; /**< Vector of alphas. */
};
/** Lower-Upper Bounded RELU activation object */
template <typename T, int S>
@@ -223,7 +280,9 @@ struct lubrelu
* @param[in] act_info Activation layer information.
*/
explicit lubrelu(ActivationLayerInfo act_info)
- : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
+ : alpha(act_info.a()),
+ beta(act_info.b()),
+ valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
{
}
@@ -237,10 +296,19 @@ struct lubrelu
vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
}
- /** Vector of alphas. */
- const ExactType valpha;
- /** Vector of betas. */
- const ExactType vbeta;
+ /** Run activation function.
+ *
+ * @param[in] val Scalar value.
+ */
+ void operator()(T &val)
+ {
+ val = std::min(alpha, std::max(beta, val));
+ }
+
+ const T alpha; /**< Scalar alpha */
+ const T beta; /**< Scalar beta */
+ const ExactType valpha; /**< Vector of alphas. */
+ const ExactType vbeta; /**< Vector of betas. */
};
} // namespace detail
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 6bd30ee845..3d84ce8449 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,10 +33,12 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include <map>
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
-using namespace arm_compute;
+#include <map>
+namespace arm_compute
+{
namespace
{
Status
@@ -82,56 +84,41 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *var, ITensorInfo *gamma, ITensorInfo *beta)
{
- if(output != nullptr)
- {
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
- }
+ ARM_COMPUTE_UNUSED(mean, var, gamma, beta);
- unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access);
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
if(output != nullptr)
{
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed |= update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
-
- // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
- if(input->data_layout() == DataLayout::NHWC)
- {
- AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
- window_changed |= update_window_and_padding(win, mean_access, var_access);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
- if(gamma != nullptr)
- {
- AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
- window_changed |= update_window_and_padding(win, gamma_access);
- }
- if(beta != nullptr)
- {
- AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
- window_changed |= update_window_and_padding(win, beta_access);
- }
+ // NEBatchNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(Status{}, win);
}
} //namespace
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
+template <typename T, bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
{
- ARM_COMPUTE_UNUSED(window);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- Iterator input(_input, window);
- Iterator output(_output, window);
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_to_use = window;
+ win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_to_use);
+ Iterator output(_output, win_to_use);
F activation_functor(_act_info);
@@ -139,196 +126,168 @@ void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window
// Only compute denominator and NEON vectors once per feature map.
int slice = -1;
- const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- float16x8_t mean_vec = vdupq_n_f16(0.0);
- float16x8_t var_vec = vdupq_n_f16(0.0);
- float16x8_t gamma_vec = vdupq_n_f16(1.0);
- float16x8_t beta_vec = vdupq_n_f16(0.0);
- float16x8_t denominator = vdupq_n_f16(0.0);
- const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
- execute_window_loop(window, [&](const Coordinates & id)
+ const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+ T mean = static_cast<T>(0);
+ T var = static_cast<T>(0);
+ T gamma = static_cast<T>(1);
+ T beta = static_cast<T>(0);
+ T denominator = static_cast<T>(0);
+
+ auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ auto var_vec = wrapper::vdup_n(var, ExactTagType{});
+ auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
+ const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
+ execute_window_loop(win_to_use, [&](const Coordinates & id)
{
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
if(slice != id.z())
{
- // Conctruct vectors
- mean_vec = vdupq_n_f16(*(input_mean + id.z()));
- var_vec = vdupq_n_f16(*(input_var + id.z()));
+ mean = input_mean[id.z()];
+ var = input_var[id.z()];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
if(input_gamma != nullptr)
{
- gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+ gamma = input_gamma[id.z()];
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
}
if(input_beta != nullptr)
{
- beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+ beta = input_beta[id.z()];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
}
// Calculate denominator
- denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
- slice = id.z();
+ denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ denominator = wrapper::vgetlane(denominator_vec, 0);
+ slice = id.z();
}
- // Calculate x bar and store results
- const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
- const float16x8_t x_bar = vmulq_f16(numerator, denominator);
- float16x8_t res = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
-
- // Perform fused activation
- if(fused_activation)
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
- }
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator_vec);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
- },
- input, output);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
-{
- ARM_COMPUTE_UNUSED(window);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- Iterator input(_input, window);
- Iterator output(_output, window);
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
+ }
- F activation_functor(_act_info);
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const T numerator = input_ptr[x] - mean;
+ const T x_bar = numerator * denominator;
+ T res = beta + x_bar * gamma;
- const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
- const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Conctruct vectors
- const float16x8_t mean_vec = vld1q_f16(input_mean + id.x());
- const float16x8_t var_vec = vld1q_f16(input_var + id.x());
- const float16x8_t gamma_vec = (input_gamma != nullptr) ? vld1q_f16(input_gamma + id.x()) : vdupq_n_f16(1.0);
- const float16x8_t beta_vec = (input_beta != nullptr) ? vld1q_f16(input_beta + id.x()) : vdupq_n_f16(0.0);
- // Calculate denominator
- const float16x8_t denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
-
- // Calculate x bar and store results
- const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
- const float16x8_t x_bar = vmulq_f16(numerator, denominator);
- float16x8_t res = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
-
- // Perform fused activation
- if(fused_activation)
- {
- activation_functor(res);
+ // Store results
+ *(output_ptr + x) = res;
}
-
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
},
input, output);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw(const Window &window)
+template <typename T, bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_nhwc(const Window &window)
{
- Iterator input(_input, window);
- Iterator output(_output, window);
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
F activation_functor(_act_info);
- // Hold information about the current feature map we are iterating.
- // Only compute denominator and NEON vectors once per feature map.
- int slice = -1;
+ const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_mean = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- float32x4_t mean_vec = vdupq_n_f32(0.0);
- float32x4_t var_vec = vdupq_n_f32(0.0);
- float32x4_t gamma_vec = vdupq_n_f32(1.0);
- float32x4_t beta_vec = vdupq_n_f32(0.0);
- float32x4_t denominator = vdupq_n_f32(0.0);
- const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
- execute_window_loop(window, [&](const Coordinates & id)
+ const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
{
- if(slice != id.z())
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
// Construct vectors
- mean_vec = vdupq_n_f32(*(input_mean + id.z()));
- var_vec = vdupq_n_f32(*(input_var + id.z()));
- if(input_gamma != nullptr)
- {
- gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
- }
- if(input_beta != nullptr)
- {
- beta_vec = vdupq_n_f32(*(input_beta + id.z()));
- }
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
// Calculate denominator
- denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
- slice = id.z();
- }
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- // Calculate x bar
- const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
- const float32x4_t x_bar = vmulq_f32(numerator, denominator);
- float32x4_t res = vmlaq_f32(beta_vec, x_bar, gamma_vec);
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
- // Perform fused activation
- if(fused_activation)
- {
- activation_functor(res);
- }
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
- // Store results
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
- },
- input, output);
-}
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
+ }
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc(const Window &window)
-{
- Iterator input(_input, window);
- Iterator output(_output, window);
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ // Construct scalars
+ const T gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const T beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
- F activation_functor(_act_info);
+ const T denominator = sqrt(input_var[x] + _epsilon);
+ const T numerator = input_ptr[x] - input_mean[x];
+ const T x_bar = numerator / denominator;
+ T res = beta + x_bar * gamma;
- const auto input_mean = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
- const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Conctruct vectors
- const float32x4_t mean_vec = vld1q_f32(input_mean + id.x());
- const float32x4_t var_vec = vld1q_f32(input_var + id.x());
- const float32x4_t gamma_vec = (input_gamma != nullptr) ? vld1q_f32(input_gamma + id.x()) : vdupq_n_f32(1.0);
- const float32x4_t beta_vec = (input_beta != nullptr) ? vld1q_f32(input_beta + id.x()) : vdupq_n_f32(0.0);
- // Calculate denominator
- const float32x4_t denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
-
- // Calculate x bar
- const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
- const float32x4_t x_bar = vmulq_f32(numerator, denominator);
- float32x4_t res = vmlaq_f32(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(fused_activation)
- {
- activation_functor(res);
+ // Store results
+ *reinterpret_cast<T *>(output_ptr + x) = res;
}
-
- // Store results
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
},
input, output);
}
@@ -340,13 +299,13 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false, ::detail::dummy<float16_t, 8>> :
- &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false, ::detail::dummy<float16_t, 8>>;
+ _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, false, detail::dummy<float16_t, 8>> :
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
- &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
+ _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, false, detail::dummy<float, 4>> :
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
break;
default:
ARM_COMPUTE_ERROR("Element size not supported");
@@ -359,31 +318,31 @@ void NEBatchNormalizationLayerKernel::configure_fused()
// NCHW Fused Batched Normalization with activation functions : FP32
static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
{
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::lubrelu<float, 4>> }
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
};
// NHWC Fused Batched Normalization with activation functions : FP32
static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
{
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::relu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::brelu<float, 4>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::lubrelu<float, 4>> }
};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// NCHW Fused Batched Normalization with activation functions : FP16
static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
{
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::lubrelu<float16_t, 8>> }
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
};
// NHWC Fused Batched Normalization with activation functions : FP16
static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nhwc =
{
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::lubrelu<float16_t, 8>> }
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::relu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::brelu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::lubrelu<float16_t, 8>> }
};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -475,3 +434,4 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo
(this->*_func)(window);
}
+} // namespace arm_compute
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
index 58b7474b41..6075e6be8d 100644
--- a/tests/validation/NEON/BatchNormalizationLayer.cpp
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,69 +71,34 @@ TEST_SUITE(BatchNormalizationLayer)
template <typename T>
using NEBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture<Tensor, Accessor, NEBatchNormalizationLayer, T>;
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
- combine(framework::dataset::make("UseBeta", { false, true }), framework::dataset::make("UseGamma", { false, true }))),
- framework::dataset::make("DataType", { DataType::F32 })),
- framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
- shape0, shape1, epsilon, use_beta, use_gamma, dt, data_layout)
-{
- TensorShape src_dst_shapes = shape0;
- if(data_layout == DataLayout::NHWC)
- {
- permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
- }
-
- // Create tensors
- Tensor src = create_tensor<Tensor>(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
- Tensor dst = create_tensor<Tensor>(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
- Tensor mean = create_tensor<Tensor>(shape1, dt, 1);
- Tensor var = create_tensor<Tensor>(shape1, dt, 1);
- Tensor beta = create_tensor<Tensor>(shape1, dt, 1);
- Tensor gamma = create_tensor<Tensor>(shape1, dt, 1);
-
- // Create and Configure function
- NEBatchNormalizationLayer norm;
- Tensor *beta_ptr = use_beta ? &beta : nullptr;
- Tensor *gamma_ptr = use_gamma ? &gamma : nullptr;
- norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
-
- // Validate valid region
- const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
- validate(dst.info()->valid_region(), valid_region);
-}
-
// *INDENT-OFF*
// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching data types
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching data types
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid mean/var/beta/gamma shape
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Fused activation's a < b
}),
- framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
- TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+ framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
})),
framework::dataset::make("MVBGInfo",{ TensorInfo(TensorShape(2U), 1, DataType::F32),
- TensorInfo(TensorShape(2U), 1, DataType::F32),
TensorInfo(TensorShape(2U), 1, DataType::F16),
TensorInfo(TensorShape(2U), 1, DataType::F32),
TensorInfo(TensorShape(5U), 1, DataType::F32),
TensorInfo(TensorShape(2U), 1, DataType::F32),
})),
framework::dataset::make("ActivationLayerInfo",{ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f),
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 2.f, 6.f),
})),
- framework::dataset::make("Expected", { true, false, false, false, false, false})),
+ framework::dataset::make("Expected", { true, false, false, false, false})),
input_info, output_info, mvbg_info, act_info, expected)
{
const auto &mean_info = mvbg_info;