diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration
file (not part of this delivery). Version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript, ...)
And the following directories
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow-up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEFFTRadixStageKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 594 |
1 files changed, 350 insertions, 244 deletions
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index 44c841f626..4b58a7b9ac 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -28,10 +28,11 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" #include <arm_neon.h> @@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b) { using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type; - const float32x2_t mask = { -1.0, 1.0 }; + const float32x2_t mask = {-1.0, 1.0}; const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); @@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant) const float a_r = wrapper::vgetlane(a, 0); const float a_i = wrapper::vgetlane(a, 1); - const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant }); + const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant}); return out; } @@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_ return wrapper::vadd(t2, e); } -float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) +float32x2_t reduce_sum_7( + float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -111,7 
+113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32 return wrapper::vadd(t00, t01); } -float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8) +float32x2_t reduce_sum_8(float32x2_t x1, + float32x2_t x2, + float32x2_t x3, + float32x2_t x4, + float32x2_t x5, + float32x2_t x6, + float32x2_t x7, + float32x2_t x8) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, x = wrapper::vadd(a, b); x = wrapper::vadd(x, c); - const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c)); - const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c)); + const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c)); + const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c)); y = z = wrapper::vsub(a, v1); y = wrapper::vadd(y, v2); z = wrapper::vsub(z, v2); } -void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3) +void fft_4(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3) { float32x2_t a = x1; float32x2_t b = c_mul_neon(w, x2); @@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c x4 = wrapper::vadd(x41, x42); } -void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4) +void fft_5(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, + const float32x2_t &w4) { const auto a = 
x1; const auto b = c_mul_neon(w, x2); @@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto d = c_mul_neon(w3, x4); const auto e = c_mul_neon(w4, x5); - const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b); - const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b); + const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b); + const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b); - const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c); + const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c); + const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c); + const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c); + const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c); - const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d); + const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d); + const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d); + const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d); - const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e); - const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e); + const auto e0 = c_mul_neon(float32x2_t{kW5_0, 
kW5_1}, e); + const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e); + const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e); x1 = reduce_sum_5(a, b, c, d, e); x2 = reduce_sum_5(a, b0, c0, d0, e0); @@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x5 = reduce_sum_5(a, b3, c3, d3, e3); } -void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, +void fft_7(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, const float32x2_t &w4, - const float32x2_t &w5, const float32x2_t &w6) + const float32x2_t &w5, + const float32x2_t &w6) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto f = c_mul_neon(w5, x6); const auto g = c_mul_neon(w6, x7); - const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b); - const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c); - const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, 
d); - const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d); - const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e); - const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e); - const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e); - const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f); - const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f); - const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f); - const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f); - const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g); - const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g); - const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g); - const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g); - const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g); + const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b); + const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b); + const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b); + const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b); + + const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c); + const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c); + const auto c2 = 
c_mul_neon(float32x2_t{kW7_0, kW7_1}, c); + const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c); + const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c); + const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d); + const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d); + const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d); + const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d); + const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d); + + const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e); + const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e); + const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e); + const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e); + const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f); + const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f); + const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f); + const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f); + const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f); + const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f); + + const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g); + const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g); + const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g); + const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g); + const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g); + const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g); x1 = reduce_sum_7(a, b, c, d, e, f, g); x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0); @@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5); } -void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t 
&x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2, +void fft_8(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + float32x2_t &x8, + const float32x2_t &w, + const float32x2_t &w2, const float32x2_t &w3, - const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6, + const float32x2_t &w4, + const float32x2_t &w5, + const float32x2_t &w6, const float32x2_t &w7) { const auto a = x1; @@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto g = c_mul_neon(w6, x7); const auto h = c_mul_neon(w7, x8); - const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b); - const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b); - const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c); - const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d); - const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d); - const auto d6 = c_mul_neon(float32x2_t{ 
-kSqrt2Div2, kSqrt2Div2 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f); - const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f); - const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f); - const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f); - const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f); - const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g); - const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g); - const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g); - - const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h); - const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h); - const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h); - const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h); - const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h); - const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h); - const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h); + const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b); + const auto b1 = c_mul_neon(float32x2_t{0, -1}, b); + const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b); + const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b); + const auto b4 = 
c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b); + const auto b5 = c_mul_neon(float32x2_t{0, 1}, b); + const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b); + + const auto c0 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c2 = c_mul_neon(float32x2_t{0, 1}, c); + const auto c3 = c_mul_neon(float32x2_t{1, 0}, c); + const auto c4 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c6 = c_mul_neon(float32x2_t{0, 1}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d); + const auto d1 = c_mul_neon(float32x2_t{0, 1}, d); + const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d); + const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d); + const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d); + const auto d5 = c_mul_neon(float32x2_t{0, -1}, d); + const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d); + + const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e1 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e3 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e5 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f); + const auto f1 = c_mul_neon(float32x2_t{0, -1}, f); + const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f); + const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f); + const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f); + const auto f5 = c_mul_neon(float32x2_t{0, 1}, f); + const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f); + + const auto g0 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g2 = c_mul_neon(float32x2_t{0, -1}, g); + const auto g3 = 
c_mul_neon(float32x2_t{1, 0}, g); + const auto g4 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g6 = c_mul_neon(float32x2_t{0, -1}, g); + + const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h); + const auto h1 = c_mul_neon(float32x2_t{0, 1}, h); + const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h); + const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h); + const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h); + const auto h5 = c_mul_neon(float32x2_t{0, -1}, h); + const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h); x1 = reduce_sum_8(a, b, c, d, e, f, g, h); x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0); @@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f } template <bool first_stage> -void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_2_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - auto a = float32x2_t{ 0, 0 }; - auto b = float32x2_t{ 0, 0 }; + auto a = float32x2_t{0, 0}; + auto b = float32x2_t{0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_2(a, b, w); // Write outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_2_axes_1(float *out, 
float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_2_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_3_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_3(a, b, c, w, w2); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, 
unsigned int NxR } } -void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_3_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_4_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_4(a, b, 
c, d, w, w2, w3); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_4_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_5_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; + 
float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_5(a, b, c, d, e, w, w2, w3, w4); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_5_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_7_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; 
j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_7_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = 
c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_8_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; - float32x2_t h = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; + float32x2_t h = {0, 0}; // Base-case prime transform - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -834,7 +924,7 @@ void 
fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_8_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_UNUSED(config); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> 
validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_UNUSED(config); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo // FFT table axis 0: [radix, first_stage] static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0; - if(fft_table_axis0.empty()) + if (fft_table_axis0.empty()) { fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>; fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>; @@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo // FFT table axis 1: [radix, first_stage] static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1; - if(fft_table_axis1.empty()) + if (fft_table_axis1.empty()) { fft_table_axis1[2] = &fft_radix_2_axes_1; fft_table_axis1[3] = &fft_radix_3_axes_1; @@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT ARM_COMPUTE_ERROR_ON_NULLPTR(input); // Output auto inizialitation if not yet initialized - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()->clone()); } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); _input = input; _output = (output == nullptr) ? 
input : output; @@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT _axis = config.axis; _radix = config.radix; - switch(config.axis) + switch (config.axis) { case 0: set_radix_stage_axis0(config); @@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); + auto win_config = + validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get(), config) + .first); return Status{}; } std::set<unsigned int> NEFFTRadixStageKernel::supported_radix() { - return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 }; + return std::set<unsigned int>{2, 3, 4, 5, 7, 8}; } void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) @@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) // Precompute FFT constants const unsigned int NxRadix = _radix * _Nx; const float alpha = 2.0f * kPi / float(NxRadix); - const float32x2_t w_m{ cosf(alpha), -sinf(alpha) }; + const float32x2_t w_m{cosf(alpha), -sinf(alpha)}; - if(_axis == 0) + if (_axis == 0) { const unsigned int N = _input->info()->dimension(0); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) { + _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, + N); + }, + in, out); } else { const unsigned int N = _input->info()->dimension(0); const unsigned int M = _input->info()->dimension(1); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M, - _input->info()->padding().right + _input->info()->padding().left, - _output->info()->padding().right + _output->info()->padding().left); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) + { + _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, + M, _input->info()->padding().right + _input->info()->padding().left, + _output->info()->padding().right + _output->info()->padding().left); + }, + in, out); } ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); |