From 14c4e0f1c3e3bfebb33eb1ceb8156b3a090f4d11 Mon Sep 17 00:00:00 2001 From: giuros01 Date: Tue, 26 Mar 2019 17:44:40 +0000 Subject: COMPMID-1961: Implement FFT (1D) on NEON Change-Id: I0bea3bfbc3b0cd9e8c9a0e0f6f430640573f08d1 Signed-off-by: giuros01 Reviewed-on: https://review.mlplatform.org/c/996 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas --- arm_compute/core/NEON/NEKernels.h | 2 + .../core/NEON/kernels/NEFFTDigitReverseKernel.h | 83 ++ .../core/NEON/kernels/NEFFTRadixStageKernel.h | 98 +++ arm_compute/runtime/NEON/NEFunctions.h | 1 + arm_compute/runtime/NEON/functions/NEFFT1D.h | 79 ++ src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp | 114 +++ src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 865 +++++++++++++++++++++ src/runtime/NEON/functions/NEFFT1D.cpp | 112 +++ tests/validation/NEON/FFT.cpp | 135 ++++ 9 files changed, 1489 insertions(+) create mode 100644 arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h create mode 100644 arm_compute/runtime/NEON/functions/NEFFT1D.h create mode 100644 src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp create mode 100644 src/core/NEON/kernels/NEFFTRadixStageKernel.cpp create mode 100644 src/runtime/NEON/functions/NEFFT1D.cpp create mode 100644 tests/validation/NEON/FFT.cpp diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 1ce3821e81..b8ae467c6d 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -62,6 +62,8 @@ #include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" #include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h" #include "arm_compute/core/NEON/kernels/NEErodeKernel.h" +#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h" +#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h" #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" diff --git a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h new file mode 100644 index 0000000000..84d55fd8f4 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H__ +#define __ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Interface for the digit reverse operation kernel. */ +class NEFFTDigitReverseKernel : public INEKernel +{ +public: + const char *name() const override + { + return "NEFFTDigitReverseKernel"; + } + /** Constructor */ + NEFFTDigitReverseKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFFTDigitReverseKernel(const NEFFTDigitReverseKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFFTDigitReverseKernel &operator=(const NEFFTDigitReverseKernel &) = delete; + /** Default Move Constructor. */ + NEFFTDigitReverseKernel(NEFFTDigitReverseKernel &&) = default; + /** Default move assignment operator */ + NEFFTDigitReverseKernel &operator=(NEFFTDigitReverseKernel &&) = default; + /** Default destructor */ + ~NEFFTDigitReverseKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] idx Digit reverse index tensor. Data type supported: U32 + * @param[in] axis Axis to perform digit reverse on. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *idx, unsigned int axis); + /** Static function to check if given info will lead to a valid configuration of @ref NEFFTDigitReverseKernel + * + * @param[in] input Source tensor info. Data types supported: F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] idx Digit reverse index tensor info. Data type supported: U32 + * @param[in] axis Axis to perform digit reverse on. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + const ITensor *_idx; + unsigned int _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h new file mode 100644 index 0000000000..a4c4be6f35 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H__
+#define __ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include <set>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the FFT radix stage kernel. */
+class NEFFTRadixStageKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEFFTRadixStageKernel";
+    }
+    /** Constructor */
+    NEFFTRadixStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTRadixStageKernel(const NEFFTRadixStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFTRadixStageKernel &operator=(const NEFFTRadixStageKernel &) = delete;
+    /** Default move constructor */
+    NEFFTRadixStageKernel(NEFFTRadixStageKernel &&) = default;
+    /** Default move assignment operator */
+    NEFFTRadixStageKernel &operator=(NEFFTRadixStageKernel &&) = default;
+    /** Default destructor */
+    ~NEFFTRadixStageKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is nullptr, the FFT will be performed in-place
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F32.
+     * @param[out]    output Destination tensor. Data type supported: same as @p input
+     * @param[in]     config FFT descriptor metadata.
+     */
+    void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTRadixStageKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     * @param[in] config FFT descriptor metadata.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
+    /** Returns the radices supported by the FFT kernel
+     *
+     * @return A set of supported radices
+     */
+    static std::set<unsigned int> supported_radix();
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    ITensor     *_input;
+    ITensor     *_output;
+    bool         _run_in_place;
+    unsigned int _Nx;
+
+    template <bool first_stage>
+    void set_radix_stage_fun(unsigned int radix);
+
+    using FFTFunctionPointerInPlace = std::function<void(float *, float *, unsigned int, unsigned int)>;
+    FFTFunctionPointerInPlace _func;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H__ */
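Background: each NEFFTRadixStageKernel instance performs one pass of a mixed-radix Cooley-Tukey decomposition. Given Nx-point sub-DFTs already laid out in digit-reversed order, a stage of radix r combines groups of r sub-DFTs into r*Nx-point DFTs by applying twiddle factors and an r-point base transform. Below is a minimal scalar sketch of the radix-2 case, using std::complex instead of the NEON vector types (illustrative only; radix2_stage is a hypothetical helper, not part of this patch):

    #include <cmath>
    #include <complex>
    #include <vector>

    // One in-place radix-2 stage: combines pairs of Nx-point sub-DFTs
    // into 2*Nx-point DFTs.
    void radix2_stage(std::vector<std::complex<float>> &data, unsigned int Nx)
    {
        const unsigned int        N     = data.size();
        const float               alpha = 2.0f * 3.14159265f / (2 * Nx);
        const std::complex<float> w_m(std::cos(alpha), -std::sin(alpha)); // twiddle increment

        std::complex<float> w(1.0f, 0.0f);
        for(unsigned int j = 0; j < Nx; ++j)
        {
            for(unsigned int k = j; k < N; k += 2 * Nx)
            {
                const auto a = data[k];
                const auto b = w * data[k + Nx]; // apply twiddle factor
                data[k]      = a + b;            // radix-2 butterfly
                data[k + Nx] = a - b;
            }
            w *= w_m; // advance to the next twiddle factor
        }
    }

The NEON kernels in this patch follow the same structure, but operate on interleaved real/imaginary floats (hence the factors of 2 in their indexing) and keep each complex value in a float32x2_t lane pair.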
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 432c751308..d8f54ea231 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -63,6 +63,7 @@
 #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
 #include "arm_compute/runtime/NEON/functions/NEErode.h"
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
 #include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
 #include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
 #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
new file mode 100644
index 0000000000..9b5ada746a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFFT1D_H__
+#define __ARM_COMPUTE_NEFFT1D_H__
+
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Basic function to execute one dimensional FFT. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFFTDigitReverseKernel Performs digit reverse
+ * -# @ref NEFFTRadixStageKernel   A list of FFT kernels depending on the radix decomposition
+ */
+class NEFFT1D : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Initialise the function's source and destination.
+     *
+     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+     * @param[in]  config FFT related configuration
+     */
+    void configure(const ITensor *input, ITensor *output, const FFT1DInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEFFT1D.
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+     * @param[in] config FFT related configuration
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    MemoryGroup                        _memory_group;
+    Tensor                             _digit_reversed_input;
+    Tensor                             _digit_reverse_indices;
+    NEFFTDigitReverseKernel            _digit_reverse_kernel;
+    std::vector<NEFFTRadixStageKernel> _fft_kernels;
+    unsigned int                       _n_ffts;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEFFT1D_H__ */
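For orientation, a sketch of how the function is meant to be driven (tensor sizes and names are illustrative; input and output are 2-channel F32 tensors holding interleaved real/imaginary values, and a default-constructed FFT1DInfo selects axis 0, the only axis supported here):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void fft1d_example()
    {
        Tensor src{}, dst{};
        src.allocator()->init(TensorInfo(TensorShape(64U), 2, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 2, DataType::F32));

        NEFFT1D   fft;
        FFT1DInfo cfg{};
        fft.configure(&src, &dst, cfg);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src with complex samples ...
        fft.run();
    }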
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000000..845fcef4f3
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis != 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, unsigned int axis)
+{
+    ARM_COMPUTE_UNUSED(idx, axis);
+
+    auto_init_if_empty(*output, *input);
+
+    Window win = calculate_max_window(*output, Steps());
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
+    : _input(nullptr), _output(nullptr), _idx(nullptr), _axis(0)
+{
+}
+
+void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, unsigned int axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), axis));
+
+    _input  = input;
+    _output = output;
+    _idx    = idx;
+    _axis   = axis;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), axis);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, axis));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), axis).first);
+    return Status{};
+}
+
+void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Iterator     out(_output, window);
+    const size_t element_size = _input->info()->element_size();
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Fetch the digit-reversed source index along the FFT axis
+        const unsigned int in_index_1d = *reinterpret_cast<unsigned int *>(_idx->ptr_to_element(Coordinates(id.x())));
+
+        auto reverse_id = id;
+        reverse_id.set(_axis, in_index_1d);
+
+        // Copy one complex element (real and imaginary part)
+        memcpy(out.ptr(), _input->ptr_to_element(reverse_id), 2 * element_size);
+    },
+    out);
+}
+} // namespace arm_compute
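The index tensor consumed above is precomputed on the host (NEFFT1D::configure fills it via helpers::fft::digit_reverse_indices, further down in this patch). For reference, digit reversal generalises bit reversal to mixed radices: index i is rewritten in the mixed-radix digits of the stage decomposition, and the digits are read back in reverse order. A scalar sketch of one common convention (illustrative only; the library helper may order the stages differently):

    #include <vector>

    std::vector<unsigned int> digit_reverse(unsigned int N, const std::vector<unsigned int> &radices)
    {
        std::vector<unsigned int> idx(N);
        for(unsigned int i = 0; i < N; ++i)
        {
            unsigned int n   = i;
            unsigned int rev = 0;
            unsigned int len = N;
            for(unsigned int r : radices)
            {
                len /= r;             // weight of this digit in the reversed number
                rev += (n % r) * len; // append the digit at the opposite end
                n /= r;
            }
            idx[i] = rev;
        }
        return idx;
    }

    // For N = 8 decomposed as {2, 2, 2} this degenerates to plain bit
    // reversal: {0, 4, 2, 6, 1, 5, 3, 7}.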
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
new file mode 100644
index 0000000000..b264590791
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -0,0 +1,865 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/traits.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <complex>
+
+namespace arm_compute
+{
+namespace
+{
+constexpr float PI = 3.141592653589793f;
+
+// Complex multiplication a * b using three real multiplications:
+//   re = a_r*b_r - a_i*b_i
+//   im = (a_r + a_i)*(b_r + b_i) - a_r*b_r - a_i*b_i
+float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
+{
+    float32x2_t tmp = wrapper::vmul(a, b);
+
+    const float P1 = wrapper::vgetlane(tmp, 0);
+    const float P2 = wrapper::vgetlane(tmp, 1);
+
+    const float a_r = wrapper::vgetlane(a, 0);
+    const float a_i = wrapper::vgetlane(a, 1);
+    const float b_r = wrapper::vgetlane(b, 0);
+    const float b_i = wrapper::vgetlane(b, 1);
+
+    const float P3  = (a_r + a_i) * (b_r + b_i);
+    float32x2_t out = { P1 - P2, P3 - P2 - P1 };
+    return out;
+}
+
+// Multiplication of a by the purely imaginary number i * img_constant
+float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
+{
+    const float a_r = wrapper::vgetlane(a, 0);
+    const float a_i = wrapper::vgetlane(a, 1);
+
+    const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+    return out;
+}
+
+float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
+{
+    const auto t0 = wrapper::vadd(a, b);
+    const auto t1 = wrapper::vadd(c, d);
+    const auto t2 = wrapper::vadd(t0, t1);
+    return wrapper::vadd(t2, e);
+}
+
+float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+{
+    const auto t0  = wrapper::vadd(x1, x2);
+    const auto t1  = wrapper::vadd(x3, x4);
+    const auto t2  = wrapper::vadd(x5, x6);
+    const auto t00 = wrapper::vadd(t0, t1);
+    const auto t01 = wrapper::vadd(t2, x7);
+
+    return wrapper::vadd(t00, t01);
+}
+
+float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+{
+    const auto t0  = wrapper::vadd(x1, x2);
+    const auto t1  = wrapper::vadd(x3, x4);
+    const auto t2  = wrapper::vadd(x5, x6);
+    const auto t3  = wrapper::vadd(x7, x8);
+    const auto t00 = wrapper::vadd(t0, t1);
+    const auto t01 = wrapper::vadd(t2, t3);
+
+    return wrapper::vadd(t00, t01);
+}
+
+void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &w)
+{
+    float32x2_t a = x;
+    float32x2_t b = c_mul_neon(w, y);
+
+    x = wrapper::vadd(a, b);
+    y = wrapper::vsub(a, b);
+}
+
+constexpr float sqrt3div2 = 0.866025403784438;
+void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, const float32x2_t &w2)
+{
+    float32x2_t a = x;
+    float32x2_t b = c_mul_neon(w, y);
+    float32x2_t c = c_mul_neon(w2, z);
+
+    x = wrapper::vadd(a, b);
+    x = wrapper::vadd(x, c);
+
+    const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
+    const auto v2 = c_mul_neon(float32x2_t{ 0.f, -sqrt3div2 }, wrapper::vsub(b, c));
+
+    y = z = wrapper::vsub(a, v1);
+    y     = wrapper::vadd(y, v2);
+    z     = wrapper::vsub(z, v2);
+}
+
+void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+{
+    float32x2_t a = x1;
+    float32x2_t b = c_mul_neon(w, x2);
+    float32x2_t c = c_mul_neon(w2, x3);
+    float32x2_t d = c_mul_neon(w3, x4);
+
+    const auto x11 = wrapper::vadd(a, b);
+    const auto x12 = wrapper::vadd(c, d);
+    x1             = wrapper::vadd(x11, x12);
+
+    const auto x21 = wrapper::vadd(a, c_mul_neon_img(b, -1));
+    const auto x22 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, 1.f));
+    x2             = wrapper::vadd(x21, x22);
+
+    const auto x31 = wrapper::vadd(a, wrapper::vneg(b));
+    const auto x32 = wrapper::vadd(c, wrapper::vneg(d));
+    x3             = wrapper::vadd(x31, x32);
+
+    const auto x41 = wrapper::vadd(a, c_mul_neon_img(b, 1));
+    const auto x42 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, -1));
+    x4             = wrapper::vadd(x41, x42);
+}
+
+constexpr float W5_0 = 0.30901699437494f;
+constexpr float W5_1 = 0.95105651629515f;
+constexpr float W5_2 = 0.80901699437494f;
+constexpr float W5_3 = 0.58778525229247f;
+void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+{
+    const auto a = x1;
+    const auto b = c_mul_neon(w, x2);
+    const auto c = c_mul_neon(w2, x3);
+    const auto d = c_mul_neon(w3, x4);
+    const auto e = c_mul_neon(w4, x5);
+
+    const auto b0 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, b);
+    const auto b1 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, b);
+    const auto b2 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, b);
+    const auto b3 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, b);
+
+    const auto c0 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, c);
+    const auto c1 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, c);
+    const auto c2 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, c);
+    const auto c3 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, c);
+
+    const auto d0 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, d);
+    const auto d1 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, d);
+    const auto d2 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, d);
+    const auto d3 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, d);
+
+    const auto e0 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, e);
+    const auto e1 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, e);
+    const auto e2 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, e);
+    const auto e3 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, e);
+
+    x1 = reduce_sum_5(a, b, c, d, e);
+    x2 = reduce_sum_5(a, b0, c0, d0, e0);
+    x3 = reduce_sum_5(a, b1, c1, d1, e1);
+    x4 = reduce_sum_5(a, b2, c2, d2, e2);
+    x5 = reduce_sum_5(a, b3, c3, d3, e3);
+}
+
+constexpr float W7_0 = 0.62348980185873f;
+constexpr float W7_1 = 0.78183148246802f;
+constexpr float W7_2 = 0.22252093395631f;
+constexpr float W7_3 = 0.97492791218182f;
+constexpr float W7_4 = 0.90096886790241f;
+constexpr float W7_5 = 0.43388373911755f;
+void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7,
+           const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6)
+{
+    const auto a = x1;
+    const auto b = c_mul_neon(w, x2);
+    const auto c = c_mul_neon(w2, x3);
+    const auto d = c_mul_neon(w3, x4);
+    const auto e = c_mul_neon(w4, x5);
+    const auto f = c_mul_neon(w5, x6);
+    const auto g = c_mul_neon(w6, x7);
+
+    const auto b0 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, b);
+    const auto b1 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, b);
+    const auto b2 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, b);
+    const auto b3 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, b);
+    const auto b4 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, b);
+    const auto b5 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, b);
+
+    const auto c0 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, c);
+    const auto c1 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, c);
+    const auto c2 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, c);
+    const auto c3 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, c);
+    const auto c4 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, c);
+    const auto c5 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, c);
+
+    const auto d0 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, d);
+    const auto d1 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, d);
+    const auto d2 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, d);
+    const auto d3 = c_mul_neon(float32x2_t{ -W7_2, +W7_3 }, d);
+    const auto d4 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, d);
+    const auto d5 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, d);
+
+    const auto e0 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, e);
+    const auto e1 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, e);
+    const auto e2 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, e);
+    const auto e3 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, e);
+    const auto e4 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, e);
+    const auto e5 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, e);
+
+    const auto f0 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, f);
+    const auto f1 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, f);
+    const auto f2 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, f);
+    const auto f3 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, f);
+    const auto f4 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, f);
+    const auto f5 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, f);
+
+    const auto g0 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, g);
+    const auto g1 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, g);
+    const auto g2 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, g);
+    const auto g3 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, g);
+    const auto g4 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, g);
+    const auto g5 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, g);
+
+    x1 = reduce_sum_7(a, b, c, d, e, f, g);
+    x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
+    x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
+    x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
+    x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
+    x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
+    x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
+}
+
+constexpr float sqrt2div2 = 0.707106781186548;
+void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8,
+           const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6, const float32x2_t &w7)
+{
+    const auto a = x1;
+    const auto b = c_mul_neon(w, x2);
+    const auto c = c_mul_neon(w2, x3);
+    const auto d = c_mul_neon(w3, x4);
+    const auto e = c_mul_neon(w4, x5);
+    const auto f = c_mul_neon(w5, x6);
+    const auto g = c_mul_neon(w6, x7);
+    const auto h = c_mul_neon(w7, x8);
+
+    const auto b0 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, b);
+    const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
+    const auto b2 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, b);
+    const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
+    const auto b4 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, b);
+    const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
+    const auto b6 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, b);
+
+    const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+    const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+    const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+    const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
+    const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+    const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+    const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+
+    const auto d0 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, d);
+    const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
+    const auto d2 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, d);
+    const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
+    const auto d4 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, d);
+    const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
+    const auto d6 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, d);
+
+    const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+    const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+    const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+    const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+
+    const auto f0 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, f);
+    const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
+    const auto f2 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, f);
+    const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
+    const auto f4 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, f);
+    const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
+    const auto f6 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, f);
+
+    const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+    const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+    const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+    const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
+    const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+    const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+    const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+
+    const auto h0 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, h);
+    const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
+    const auto h2 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, h);
+    const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
+    const auto h4 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, h);
+    const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
+    const auto h6 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, h);
+
+    x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
+    x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
+    x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
+    x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
+    x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
+    x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
+    x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
+    x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
+}
+
+template <bool first_stage>
+void fft_radix_2_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+    unsigned int Nx2   = 2 * Nx;
+    float        alpha = 2 * PI / Nx2;
+
+    float32x2_t       w{ 1, 0 };
+    const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx2)
+        {
+            auto a = float32x2_t{ 0, 0 };
+            auto b = float32x2_t{ 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                a             = wrapper::vgetlow(ab);
+                b             = wrapper::vgethigh(ab);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+            }
+
+            // Base-case prime transform
+            fft_2(a, b, w);
+
+            // Write outputs
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+            }
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_3_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+    const unsigned int Nx3   = 3 * Nx;
+    const float        alpha = 2 * PI / float(Nx3);
+
+    float32x2_t       w{ 1, 0 };
+    const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const auto w2 = c_mul_neon(w, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx3)
+        {
+            // Load inputs
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                a             = wrapper::vgetlow(ab);
+                b             = wrapper::vgethigh(ab);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+            }
+            c = wrapper::vload(x + k + 4 * Nx);
+
+            // Base-case prime transform
+            fft_3(a, b, c, w, w2);
+
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+            }
+            wrapper::vstore(X + k + 4 * Nx, c);
+        }
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_4_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+    unsigned int Nx4   = 4 * Nx;
+    const float  alpha = 2 * PI / float(Nx4);
+
+    float32x2_t w{ 1, 0 };
+    float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const auto w2 = c_mul_neon(w, w);
+        const auto w3 = c_mul_neon(w2, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx4)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+                a             = wrapper::vgetlow(ab);
+                b             = wrapper::vgethigh(ab);
+                c             = wrapper::vgetlow(cd);
+                d             = wrapper::vgethigh(cd);
+            }
+            else
+            {
+                // Load inputs
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+            }
+
+            // Base-case prime transform
+            fft_4(a, b, c, d, w, w2, w3);
+
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+            }
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_5_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+    unsigned int Nx5   = 5 * Nx;
+    const float  alpha = 2 * PI / float(Nx5);
+
+    float32x2_t w{ 1, 0 };
+    float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx5)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+
+                a = wrapper::vgetlow(ab);
+                b = wrapper::vgethigh(ab);
+                c = wrapper::vgetlow(cd);
+                d = wrapper::vgethigh(cd);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+            }
+            e = wrapper::vload(x + k + 8 * Nx);
+
+            // Base-case prime transform
+            fft_5(a, b, c, d, e, w, w2, w3, w4);
+
+            // Store outputs
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+            }
+            wrapper::vstore(X + k + 8 * Nx, e);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_7_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+    unsigned int Nx7   = 7 * Nx;
+    const float  alpha = 2 * PI / float(Nx7);
+
+    float32x2_t w{ 1, 0 };
+    float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+        const float32x2_t w5 = c_mul_neon(w4, w);
+        const float32x2_t w6 = c_mul_neon(w5, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx7)
+        {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+            float32x2_t f = { 0, 0 };
+            float32x2_t g = { 0, 0 };
+
+            // Load inputs
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+                const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+
+                a = wrapper::vgetlow(ab);
+                b = wrapper::vgethigh(ab);
+                c = wrapper::vgetlow(cd);
+                d = wrapper::vgethigh(cd);
+                e = wrapper::vgetlow(ef);
+                f = wrapper::vgethigh(ef);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+                e = wrapper::vload(x + k + 8 * Nx);
+                f = wrapper::vload(x + k + 10 * Nx);
+            }
+            g = wrapper::vload(x + k + 12 * Nx);
+
+            // Base-case prime transform
+            fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
+
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+                wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+                wrapper::vstore(X + k + 8 * Nx, e);
+                wrapper::vstore(X + k + 10 * Nx, f);
+            }
+            wrapper::vstore(X + k + 12 * Nx, g);
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+template <bool first_stage>
+void fft_radix_8_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+    unsigned int Nx8   = 8 * Nx;
+    const float  alpha = 2 * PI / float(Nx8);
+
+    float32x2_t       w{ 1, 0 };
+    const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+    for(unsigned int j = 0; j < Nx; j++)
+    {
+        const float32x2_t w2 = c_mul_neon(w, w);
+        const float32x2_t w3 = c_mul_neon(w2, w);
+        const float32x2_t w4 = c_mul_neon(w3, w);
+        const float32x2_t w5 = c_mul_neon(w4, w);
+        const float32x2_t w6 = c_mul_neon(w5, w);
+        const float32x2_t w7 = c_mul_neon(w6, w);
+
+        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx8)
+        {
+            // Load inputs
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+            float32x2_t f = { 0, 0 };
+            float32x2_t g = { 0, 0 };
+            float32x2_t h = { 0, 0 };
+
+            if(first_stage)
+            {
+                const auto ab = wrapper::vloadq(x + k);
+                const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+                const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+                const auto gh = wrapper::vloadq(x + k + 12 * Nx);
+
+                a = wrapper::vgetlow(ab);
+                b = wrapper::vgethigh(ab);
+                c = wrapper::vgetlow(cd);
+                d = wrapper::vgethigh(cd);
+                e = wrapper::vgetlow(ef);
+                f = wrapper::vgethigh(ef);
+                g = wrapper::vgetlow(gh);
+                h = wrapper::vgethigh(gh);
+            }
+            else
+            {
+                a = wrapper::vload(x + k);
+                b = wrapper::vload(x + k + 2 * Nx);
+                c = wrapper::vload(x + k + 4 * Nx);
+                d = wrapper::vload(x + k + 6 * Nx);
+                e = wrapper::vload(x + k + 8 * Nx);
+                f = wrapper::vload(x + k + 10 * Nx);
+                g = wrapper::vload(x + k + 12 * Nx);
+                h = wrapper::vload(x + k + 14 * Nx);
+            }
+
+            // Base-case prime transform
+            fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
+
+            // Store outputs
+            if(first_stage)
+            {
+                wrapper::vstore(X + k, wrapper::vcombine(a, b));
+                wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+                wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+                wrapper::vstore(X + k + 12 * Nx, wrapper::vcombine(g, h));
+            }
+            else
+            {
+                wrapper::vstore(X + k, a);
+                wrapper::vstore(X + k + 2 * Nx, b);
+                wrapper::vstore(X + k + 4 * Nx, c);
+                wrapper::vstore(X + k + 6 * Nx, d);
+                wrapper::vstore(X + k + 8 * Nx, e);
+                wrapper::vstore(X + k + 10 * Nx, f);
+                wrapper::vstore(X + k + 12 * Nx, g);
+                wrapper::vstore(X + k + 14 * Nx, h);
+            }
+        }
+
+        w = c_mul_neon(w, w_m);
+    }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(config.axis != 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(NEFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output, *input);
+    }
+
+    Window win = calculate_max_window(*input, Steps(config.radix));
+    if(output != nullptr)
+    {
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTRadixStageKernel::NEFFTRadixStageKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false), _Nx(0), _func()
+{
+}
+
+template <bool first_stage>
+void NEFFTRadixStageKernel::set_radix_stage_fun(unsigned int radix)
+{
+    switch(radix)
+    {
+        case 2:
+            _func = &fft_radix_2_axes_0<first_stage>;
+            break;
+        case 3:
+            _func = &fft_radix_3_axes_0<first_stage>;
+            break;
+        case 4:
+            _func = &fft_radix_4_axes_0<first_stage>;
+            break;
+        case 5:
+            _func = &fft_radix_5_axes_0<first_stage>;
+            break;
+        case 7:
+            _func = &fft_radix_7_axes_0<first_stage>;
+            break;
+        case 8:
+            _func = &fft_radix_8_axes_0<first_stage>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Radix not supported");
+    }
+}
+
+void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Output auto initialization if not yet initialized
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output->info(), *input->info()->clone());
+    }
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+    _input        = input;
+    _output       = output;
+    _run_in_place = (output == nullptr) || (output == input);
+    _Nx           = config.Nx;
+
+    if(config.is_first_stage)
+    {
+        set_radix_stage_fun<true>(config.radix);
+    }
+    else
+    {
+        set_radix_stage_fun<false>(config.radix);
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+    const bool run_in_place = (output == nullptr) || (output == input);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (run_in_place) ? nullptr : output->clone().get(),
+                                                              config)
+                                .first);
+
+    return Status{};
+}
+
+std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
+{
+    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
+
+void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window input_window = window;
+    input_window.set(Window::DimX, 0);
+
+    unsigned int N = _input->info()->dimension(0);
+
+    Iterator in(_input, input_window);
+    Iterator out(_run_in_place ? _input : _output, input_window);
+
+    execute_window_loop(input_window, [&](const Coordinates &)
+    {
+        _func(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, N);
+    },
+    in, out);
+}
+} // namespace arm_compute
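Each of the radix kernels above computes the same transform as the textbook DFT, X[k] = sum_n x[n] * e^(-2*pi*i*k*n/N), just decomposed into stages. A naive O(N^2) reference like the following is handy for spot-checking a stage pipeline within a small tolerance (illustrative only, not part of the patch):

    #include <cmath>
    #include <complex>
    #include <vector>

    std::vector<std::complex<float>> naive_dft(const std::vector<std::complex<float>> &x)
    {
        constexpr float                  PI = 3.141592653589793f;
        const std::size_t                N  = x.size();
        std::vector<std::complex<float>> X(N);
        for(std::size_t k = 0; k < N; ++k)
        {
            std::complex<float> acc(0.f, 0.f);
            for(std::size_t n = 0; n < N; ++n)
            {
                const float phi = -2.f * PI * k * n / N;
                acc += x[n] * std::complex<float>(std::cos(phi), std::sin(phi));
            }
            X[k] = acc;
        }
        return X;
    }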
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
new file mode 100644
index 0000000000..d3ff674a2a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _digit_reversed_input(), _digit_reverse_indices(), _digit_reverse_kernel(), _fft_kernels(), _n_ffts(0)
+{
+}
+
+void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &config)
+{
+    // Decompose size to radix factors
+    const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->info()->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+    // Configure digit reverse
+    TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+    _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+    _memory_group.manage(&_digit_reversed_input);
+    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, config.axis);
+
+    // Create and configure FFT kernels
+    unsigned int Nx = 1;
+    _n_ffts         = decomposed_vector.size();
+    _fft_kernels.resize(_n_ffts);
+    for(unsigned int i = 0; i < _n_ffts; ++i)
+    {
+        const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+        FFTRadixStageKernelInfo fft_kernel_desc;
+        fft_kernel_desc.axis           = config.axis;
+        fft_kernel_desc.radix          = radix_for_stage;
+        fft_kernel_desc.Nx             = Nx;
+        fft_kernel_desc.is_first_stage = (i == 0);
+        // The last stage writes to the output; all other stages run in-place
+        _fft_kernels[i].configure(&_digit_reversed_input, i == (_n_ffts - 1) ? output : nullptr, fft_kernel_desc);
+
+        Nx *= radix_for_stage;
+    }
+
+    // Allocate tensors
+    _digit_reversed_input.allocator()->allocate();
+    _digit_reverse_indices.allocator()->allocate();
+
+    // Init digit reverse indices
+    const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+    std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+}
+
+Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(config.axis != 0);
+
+    // Check if FFT is decomposable
+    const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void NEFFT1D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    NEScheduler::get().schedule(&_digit_reverse_kernel, Window::DimY);
+
+    for(unsigned int i = 0; i < _n_ffts; ++i)
+    {
+        NEScheduler::get().schedule(&_fft_kernels[i], Window::DimY);
+    }
+}
+} // namespace arm_compute
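configure() relies on helpers::fft::decompose_stages to factor the FFT length into supported radices; an empty result (e.g. N = 11, which contains a prime factor outside {2, 3, 4, 5, 7, 8}) is rejected. A greedy factorisation in the same spirit (illustrative only; the real helper may pick factors in a different order):

    #include <set>
    #include <vector>

    std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &supported)
    {
        std::vector<unsigned int> stages;
        unsigned int              res = N;
        while(res > 1)
        {
            unsigned int picked = 0;
            for(auto it = supported.rbegin(); it != supported.rend(); ++it) // try largest radix first
            {
                if(res % *it == 0)
                {
                    picked = *it;
                    break;
                }
            }
            if(picked == 0)
            {
                return {}; // not decomposable with the supported radices
            }
            stages.push_back(picked);
            res /= picked;
        }
        return stages;
    }

    // decompose(96, {2, 3, 4, 5, 7, 8}) -> {8, 4, 3}, matching the 96-point
    // shape exercised by the tests below.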
diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp
new file mode 100644
index 0000000000..14fb7e2518
--- /dev/null
+++ b/tests/validation/NEON/FFT.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/FFTFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F32 });
+const auto shapes_1d  = framework::dataset::make("TensorShape", { TensorShape(2U, 2U, 3U), TensorShape(3U, 2U, 3U),
+                                                                  TensorShape(4U, 2U, 3U), TensorShape(5U, 2U, 3U),
+                                                                  TensorShape(7U, 2U, 3U), TensorShape(8U, 2U, 3U),
+                                                                  TensorShape(9U, 2U, 3U), TensorShape(25U, 2U, 3U),
+                                                                  TensorShape(49U, 2U, 3U), TensorShape(64U, 2U, 3U),
+                                                                  TensorShape(16U, 2U, 3U), TensorShape(32U, 2U, 3U),
+                                                                  TensorShape(96U, 2U, 2U)
+                                                                });
+
+const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
+
+RelativeTolerance<float> tolerance_f32(0.1f);   /**< Relative tolerance value for FP32 */
+constexpr float          tolerance_num = 0.07f; /**< Tolerance number */
+
+} // namespace
+TEST_SUITE(NEON)
+TEST_SUITE(FFT1D)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_1d, data_types),
+               shape, data_type)
+{
+    // Create tensors
+    Tensor src = create_tensor<Tensor>(shape, data_type, 2);
+    Tensor dst = create_tensor<Tensor>(shape, data_type, 2);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NEFFT1D fft1d;
+    fft1d.configure(&src, &dst, FFT1DInfo());
+
+    // Validate valid region
+    const ValidRegion valid_region = shape_to_valid_region(shape);
+    validate(src.info()->valid_region(), valid_region);
+    validate(dst.info()->valid_region(), valid_region);
+
+    // Validate padding
+    validate(src.info()->padding(), PaddingSize());
+    validate(dst.info()->padding(), PaddingSize());
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Mismatching data types
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Mismatching shapes
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 3, DataType::F32), // Invalid channels
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Unsupported axis
+                                                       TensorInfo(TensorShape(11U, 13U, 2U), 2, DataType::F32), // Undecomposable FFT
+                                                       TensorInfo(TensorShape(25U, 13U, 2U), 2, DataType::F32),
+               }),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F16),
+                                                       TensorInfo(TensorShape(16U, 13U, 2U), 2, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32),
+                                                       TensorInfo(TensorShape(11U, 13U, 2U), 2, DataType::F32),
+                                                       TensorInfo(TensorShape(25U, 13U, 2U), 2, DataType::F32),
+               })),
+               framework::dataset::make("Axis", { 0, 0, 0, 2, 0, 0 })),
+               framework::dataset::make("Expected", { false, false, false, false, false, true })),
+               input_info, output_info, axis, expected)
+{
+    FFT1DInfo desc;
+    desc.axis      = axis;
+    const Status s = NEFFT1D::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), desc);
+    ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEFFT1DFixture = FFTValidationFixture<Tensor, Accessor, NEFFT1D, FFT1DInfo, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFFT1DFixture<float>, framework::DatasetMode::ALL, combine(shapes_1d, framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // FFT1D
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
-- 
cgit v1.2.1