From e9fd8b4f14f64aa23ec8554b619a4aa49d5e3183 Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Tue, 19 Sep 2023 14:24:31 +0100 Subject: L2Norm changes to enable fp16 in armv8a multi_isa builds * Code guarded with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC needs to be moved to an fp16.cpp file to allow compilation with -march=armv8.2-a+fp16 * fp16.cpp needs to use the template l2_normalize_x() and l2_normalize_yz which had to be moved from impl.cpp to impl.h * Removed impl.cpp * Partially resolves MLCE-1102 Signed-off-by: Pablo Marquez Tello Change-Id: Id00a823730108293fc712295a178dad80588af30 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10344 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins --- src/cpu/kernels/l2normlayer/generic/neon/impl.cpp | 131 ---------------------- src/cpu/kernels/l2normlayer/generic/neon/impl.h | 93 ++++++++++++++- 2 files changed, 87 insertions(+), 137 deletions(-) delete mode 100644 src/cpu/kernels/l2normlayer/generic/neon/impl.cpp (limited to 'src/cpu') diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp b/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp deleted file mode 100644 index 2886537702..0000000000 --- a/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2017-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) -{ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input_it(in, win_collapsed); - Iterator sum_it(sum, win_collapsed); - Iterator output_it(out, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - const T sum_value = *reinterpret_cast(sum_it.ptr()); - const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_value, static_cast(epsilon))); - const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); -} - -template -void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) -{ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Window window_sum(win); - window_sum.set(axis, Window::Dimension(0, 0, 0)); - - Iterator input_it(in, win); - Iterator sum_it(sum, window_sum); - Iterator output_it(out, win); - - const auto vec_eps = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_it.ptr()); - const auto sum_ptr = reinterpret_cast(sum_it.ptr()); - const auto out_ptr = reinterpret_cast(output_it.ptr()); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast(epsilon))); - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); -} - -template void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis); -template void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window); - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis); -template void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window); -#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h index 98391fb3fd..a06cdd33d3 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h +++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,21 +24,102 @@ #ifndef SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H #define SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/Registrars.h" + #include namespace arm_compute { -class ITensor; -class Window; - namespace cpu { template -void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window); +void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) +{ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input_it(in, win_collapsed); + Iterator sum_it(sum, win_collapsed); + Iterator output_it(out, win_collapsed); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + const T sum_value = *reinterpret_cast(sum_it.ptr()); + const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_value, static_cast(epsilon))); + const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); + + // Compute elements over vector steps + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); +} template -void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis); +void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +{ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window window_sum(win); + window_sum.set(axis, Window::Dimension(0, 0, 0)); + + Iterator input_it(in, win); + Iterator sum_it(sum, window_sum); + Iterator output_it(out, win); + + const auto vec_eps = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_it.ptr()); + const auto sum_ptr = reinterpret_cast(sum_it.ptr()); + const auto out_ptr = reinterpret_cast(output_it.ptr()); + + // Compute elements over vector steps + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const T norm_value = static_cast(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast(epsilon))); + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); +} } // namespace cpu } // namespace arm_compute #endif //SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H -- cgit v1.2.1