author    giuros01 <giuseppe.rossini@arm.com>    2019-03-26 17:44:40 +0000
committer Giuseppe Rossini <giuseppe.rossini@arm.com>    2019-04-26 15:16:46 +0000
commit    14c4e0f1c3e3bfebb33eb1ceb8156b3a090f4d11 (patch)
tree      b28e4be597f8dcd1fed4408cebb130af49272390
parent    098516b74a0948707119c64c641a0a8fdf529454 (diff)
download  ComputeLibrary-14c4e0f1c3e3bfebb33eb1ceb8156b3a090f4d11.tar.gz
COMPMID-1961: Implement FFT (1D) on NEON
Change-Id: I0bea3bfbc3b0cd9e8c9a0e0f6f430640573f08d1
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/996
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--  arm_compute/core/NEON/NEKernels.h                           2
-rw-r--r--  arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h    83
-rw-r--r--  arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h      98
-rw-r--r--  arm_compute/runtime/NEON/NEFunctions.h                      1
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFFT1D.h               79
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp         114
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.cpp           865
-rw-r--r--  src/runtime/NEON/functions/NEFFT1D.cpp                    112
-rw-r--r--  tests/validation/NEON/FFT.cpp                              135
9 files changed, 1489 insertions(+), 0 deletions(-)
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 1ce3821e81..b8ae467c6d 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -62,6 +62,8 @@
#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
diff --git a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
new file mode 100644
index 0000000000..84d55fd8f4
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H__
+#define __ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the digit reverse operation kernel. */
+class NEFFTDigitReverseKernel : public INEKernel
+{
+public:
+ const char *name() const override
+ {
+ return "NEFFTDigitReverseKernel";
+ }
+ /** Constructor */
+ NEFFTDigitReverseKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFTDigitReverseKernel(const NEFFTDigitReverseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFTDigitReverseKernel &operator=(const NEFFTDigitReverseKernel &) = delete;
+ /** Default Move Constructor. */
+ NEFFTDigitReverseKernel(NEFFTDigitReverseKernel &&) = default;
+ /** Default move assignment operator */
+ NEFFTDigitReverseKernel &operator=(NEFFTDigitReverseKernel &&) = default;
+ /** Default destructor */
+ ~NEFFTDigitReverseKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] idx Digit reverse index tensor. Data type supported: U32
+ * @param[in] axis Axis to perform digit reverse on.
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *idx, unsigned int axis);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEFFTDigitReverseKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] idx Digit reverse index tensor info. Data type supported: U32
+ * @param[in] axis Axis to perform digit reverse on.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_idx;
+ unsigned int _axis;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
new file mode 100644
index 0000000000..a4c4be6f35
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H__
+#define __ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include <set>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Interface for the FFT kernel. */
+class NEFFTRadixStageKernel : public INEKernel
+{
+public:
+ const char *name() const override
+ {
+ return "NEFFTRadixStageKernel";
+ }
+ /** Constructor */
+ NEFFTRadixStageKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFTRadixStageKernel(const NEFFTRadixStageKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFTRadixStageKernel &operator=(const NEFFTRadixStageKernel &) = delete;
+ /** Default Move Constructor. */
+ NEFFTRadixStageKernel(NEFFTRadixStageKernel &&) = default;
+ /** Default move assignment operator */
+ NEFFTRadixStageKernel &operator=(NEFFTRadixStageKernel &&) = default;
+ /** Default destructor */
+ ~NEFFTRadixStageKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @note If the output tensor is nullptr, the FFT will be performed in-place
+ *
+ * @param[in,out] input Source tensor. Data types supported: F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] config FFT descriptor metadata.
+ */
+ void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEFFTRadixStageKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] config FFT descriptor metadata.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
+ /** Returns the radices that are supported by the FFT kernel
+ *
+ * @return A set of supported radices
+ */
+ static std::set<unsigned int> supported_radix();
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ ITensor *_input;
+ ITensor *_output;
+ bool _run_in_place;
+ unsigned int _Nx;
+
+ template <bool first_stage>
+ void set_radix_stage_fun(unsigned int radix);
+
+ using FFTFunctionPointerInPlace = std::function<void(float *, float *, unsigned int, unsigned int)>;
+ FFTFunctionPointerInPlace _func;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H__ */
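Note: each radix stage is one mixed-radix decimation-in-time pass. With Nx the length of the already-transformed sub-blocks and r the radix of the stage, every group of r complex values spaced Nx apart is recombined as

    X[k + m*Nx] = \sum_{j=0}^{r-1} e^{-2\pi i\,jk/(r\,Nx)} \, e^{-2\pi i\,jm/r} \, x[k + j*Nx],  for m = 0..r-1 and k = 0..Nx-1.

In the implementation in src/core/NEON/kernels/NEFFTRadixStageKernel.cpp below, alpha = 2*PI/(r*Nx), the running twiddle w is advanced by w_m = (cos(alpha), -sin(alpha)) once per k, and the fixed constants in fft_3/fft_5/fft_7/fft_8 are the roots of unity e^{-2*pi*i*j*m/r}.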
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 432c751308..d8f54ea231 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -63,6 +63,7 @@
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
#include "arm_compute/runtime/NEON/functions/NEErode.h"
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
#include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
new file mode 100644
index 0000000000..9b5ada746a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFFT1D_H__
+#define __ARM_COMPUTE_NEFFT1D_H__
+
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Basic function to execute one dimensional FFT. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFFTDigitReverseKernel Performs digit reverse
+ * -# @ref NEFFTRadixStageKernel A list of FFT kernels depending on the radix decomposition
+ */
+class NEFFT1D : public IFunction
+{
+public:
+ /** Default Constructor */
+ NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialise the function's source and destination tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F32.
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * @param[in] config FFT related configuration
+ */
+ void configure(const ITensor *input, ITensor *output, const FFT1DInfo &config);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEFFT1D.
+ *
+ * @param[in] input Source tensor info. Data types supported: F32.
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+ * @param[in] config FFT related configuration
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ MemoryGroup _memory_group;
+ Tensor _digit_reversed_input;
+ Tensor _digit_reverse_indices;
+ NEFFTDigitReverseKernel _digit_reverse_kernel;
+ std::vector<NEFFTRadixStageKernel> _fft_kernels;
+ unsigned int _n_ffts;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEFFT1D_H__ */
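Note: a minimal usage sketch for NEFFT1D (not part of this patch; the single-axis, two-channel interleaved complex layout follows the validation tests added below):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 64-point complex FFT: shape (64), 2 channels = interleaved real/imaginary F32
        Tensor src;
        Tensor dst;
        src.allocator()->init(TensorInfo(TensorShape(64U), 2, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 2, DataType::F32));

        NEFFT1D fft;
        FFT1DInfo cfg; // axis defaults to 0, the only axis this patch supports
        fft.configure(&src, &dst, cfg);

        // Allocate backing memory, then fill src with interleaved complex samples
        src.allocator()->allocate();
        dst.allocator()->allocate();

        fft.run(); // digit-reverse kernel followed by one radix-stage kernel per factor
        return 0;
    }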
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000000..845fcef4f3
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis != 0);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, unsigned int axis)
+{
+ ARM_COMPUTE_UNUSED(idx, axis);
+
+ auto_init_if_empty(*output, *input);
+
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
+ : _input(nullptr), _output(nullptr), _idx(nullptr), _axis(0)
+{
+}
+
+void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, unsigned int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), axis));
+
+ _input = input;
+ _output = output;
+ _idx = idx;
+ _axis = axis;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), axis).first);
+ return Status{};
+}
+
+void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ Iterator out(_output, window);
+ const size_t element_size = _input->info()->element_size();
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ unsigned int in_index_1d = *reinterpret_cast<unsigned int *>(_idx->ptr_to_element(Coordinates(id.x())));
+
+ auto reverse_id = id;
+ reverse_id.set(_axis, in_index_1d);
+
+ memcpy(out.ptr(), _input->ptr_to_element(reverse_id), 2 * element_size);
+
+ },
+ out);
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+}
+} // namespace arm_compute
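Note: the kernel above only gathers elements along the chosen axis through the precomputed index tensor; the indices themselves are produced by helpers::fft::digit_reverse_indices in NEFFT1D::configure. For a pure radix-2 decomposition the permutation degenerates to ordinary bit reversal, sketched below for power-of-two lengths only (stand-alone illustration, not part of the library):

    #include <cstdint>
    #include <vector>

    // Bit-reversed indices for N a power of two; for N = 8 this yields {0, 4, 2, 6, 1, 5, 3, 7}.
    std::vector<uint32_t> bit_reverse_indices(uint32_t N)
    {
        uint32_t bits = 0;
        while((1u << bits) < N)
        {
            ++bits;
        }
        std::vector<uint32_t> idx(N);
        for(uint32_t i = 0; i < N; ++i)
        {
            uint32_t r = 0;
            for(uint32_t b = 0; b < bits; ++b)
            {
                r |= ((i >> b) & 1u) << (bits - 1 - b);
            }
            idx[i] = r;
        }
        return idx;
    }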
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
new file mode 100644
index 0000000000..b264590791
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -0,0 +1,865 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/traits.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <complex>
+
+namespace arm_compute
+{
+namespace
+{
+constexpr float PI = 3.141592653589793f;
+
+float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
+{
+ float32x2_t tmp = wrapper::vmul(a, b);
+
+ const float P1 = wrapper::vgetlane(tmp, 0);
+ const float P2 = wrapper::vgetlane(tmp, 1);
+
+ const float a_r = wrapper::vgetlane(a, 0);
+ const float a_i = wrapper::vgetlane(a, 1);
+ const float b_r = wrapper::vgetlane(b, 0);
+ const float b_i = wrapper::vgetlane(b, 1);
+
+ const float P3 = (a_r + a_i) * (b_r + b_i);
+ float32x2_t out = { P1 - P2, P3 - P2 - P1 };
+ return out;
+}
+
+float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
+{
+ const float a_r = wrapper::vgetlane(a, 0);
+ const float a_i = wrapper::vgetlane(a, 1);
+
+ const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+ return out;
+}
+
+float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
+{
+ const auto t0 = wrapper::vadd(a, b);
+ const auto t1 = wrapper::vadd(c, d);
+ const auto t2 = wrapper::vadd(t0, t1);
+ return wrapper::vadd(t2, e);
+}
+
+float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+{
+ const auto t0 = wrapper::vadd(x1, x2);
+ const auto t1 = wrapper::vadd(x3, x4);
+ const auto t2 = wrapper::vadd(x5, x6);
+ const auto t00 = wrapper::vadd(t0, t1);
+ const auto t01 = wrapper::vadd(t2, x7);
+
+ return wrapper::vadd(t00, t01);
+}
+
+float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+{
+ const auto t0 = wrapper::vadd(x1, x2);
+ const auto t1 = wrapper::vadd(x3, x4);
+ const auto t2 = wrapper::vadd(x5, x6);
+ const auto t3 = wrapper::vadd(x7, x8);
+ const auto t00 = wrapper::vadd(t0, t1);
+ const auto t01 = wrapper::vadd(t2, t3);
+
+ return wrapper::vadd(t00, t01);
+}
+
+void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &w)
+{
+ float32x2_t a = x;
+ float32x2_t b = c_mul_neon(w, y);
+
+ x = wrapper::vadd(a, b);
+ y = wrapper::vsub(a, b);
+}
+
+constexpr float sqrt3div2 = 0.866025403784438;
+void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, const float32x2_t &w2)
+{
+ float32x2_t a = x;
+ float32x2_t b = c_mul_neon(w, y);
+ float32x2_t c = c_mul_neon(w2, z);
+
+ x = wrapper::vadd(a, b);
+ x = wrapper::vadd(x, c);
+
+ const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
+ const auto v2 = c_mul_neon(float32x2_t{ 0.f, -sqrt3div2 }, wrapper::vsub(b, c));
+
+ y = z = wrapper::vsub(a, v1);
+ y = wrapper::vadd(y, v2);
+ z = wrapper::vsub(z, v2);
+}
+
+void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+{
+ float32x2_t a = x1;
+ float32x2_t b = c_mul_neon(w, x2);
+ float32x2_t c = c_mul_neon(w2, x3);
+ float32x2_t d = c_mul_neon(w3, x4);
+
+ const auto x11 = wrapper::vadd(a, b);
+ const auto x12 = wrapper::vadd(c, d);
+ x1 = wrapper::vadd(x11, x12);
+
+ const auto x21 = wrapper::vadd(a, c_mul_neon_img(b, -1));
+ const auto x22 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, 1.f));
+ x2 = wrapper::vadd(x21, x22);
+
+ const auto x31 = wrapper::vadd(a, wrapper::vneg(b));
+ const auto x32 = wrapper::vadd(c, wrapper::vneg(d));
+ x3 = wrapper::vadd(x31, x32);
+
+ const auto x41 = wrapper::vadd(a, c_mul_neon_img(b, 1));
+ const auto x42 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, -1));
+ x4 = wrapper::vadd(x41, x42);
+}
+
+constexpr float W5_0 = 0.30901699437494f;
+constexpr float W5_1 = 0.95105651629515f;
+constexpr float W5_2 = 0.80901699437494f;
+constexpr float W5_3 = 0.58778525229247f;
+void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+{
+ const auto a = x1;
+ const auto b = c_mul_neon(w, x2);
+ const auto c = c_mul_neon(w2, x3);
+ const auto d = c_mul_neon(w3, x4);
+ const auto e = c_mul_neon(w4, x5);
+
+ const auto b0 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, b);
+ const auto b1 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, b);
+ const auto b2 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, b);
+ const auto b3 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, c);
+ const auto c1 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, c);
+ const auto c2 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, c);
+ const auto c3 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, d);
+ const auto d1 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, d);
+ const auto d2 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, d);
+ const auto d3 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{ W5_0, W5_1 }, e);
+ const auto e1 = c_mul_neon(float32x2_t{ -W5_2, W5_3 }, e);
+ const auto e2 = c_mul_neon(float32x2_t{ -W5_2, -W5_3 }, e);
+ const auto e3 = c_mul_neon(float32x2_t{ W5_0, -W5_1 }, e);
+
+ x1 = reduce_sum_5(a, b, c, d, e);
+ x2 = reduce_sum_5(a, b0, c0, d0, e0);
+ x3 = reduce_sum_5(a, b1, c1, d1, e1);
+ x4 = reduce_sum_5(a, b2, c2, d2, e2);
+ x5 = reduce_sum_5(a, b3, c3, d3, e3);
+}
+
+constexpr float W7_0 = 0.62348980185873f;
+constexpr float W7_1 = 0.78183148246802f;
+constexpr float W7_2 = 0.22252093395631f;
+constexpr float W7_3 = 0.97492791218182f;
+constexpr float W7_4 = 0.90096886790241f;
+constexpr float W7_5 = 0.43388373911755f;
+void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+ const float32x2_t &w4,
+ const float32x2_t &w5, const float32x2_t &w6)
+{
+ const auto a = x1;
+ const auto b = c_mul_neon(w, x2);
+ const auto c = c_mul_neon(w2, x3);
+ const auto d = c_mul_neon(w3, x4);
+ const auto e = c_mul_neon(w4, x5);
+ const auto f = c_mul_neon(w5, x6);
+ const auto g = c_mul_neon(w6, x7);
+
+ const auto b0 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, b);
+ const auto b1 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, b);
+ const auto b2 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, b);
+ const auto b3 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, b);
+ const auto b4 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, b);
+ const auto b5 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, c);
+ const auto c1 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, c);
+ const auto c2 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, c);
+ const auto c3 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, c);
+ const auto c4 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, c);
+ const auto c5 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, d);
+ const auto d1 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, d);
+ const auto d2 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, d);
+ const auto d3 = c_mul_neon(float32x2_t{ -W7_2, +W7_3 }, d);
+ const auto d4 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, d);
+ const auto d5 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, e);
+ const auto e1 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, e);
+ const auto e2 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, e);
+ const auto e3 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, e);
+ const auto e4 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, e);
+ const auto e5 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, f);
+ const auto f1 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, f);
+ const auto f2 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, f);
+ const auto f3 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, f);
+ const auto f4 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, f);
+ const auto f5 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{ W7_0, W7_1 }, g);
+ const auto g1 = c_mul_neon(float32x2_t{ -W7_2, W7_3 }, g);
+ const auto g2 = c_mul_neon(float32x2_t{ -W7_4, W7_5 }, g);
+ const auto g3 = c_mul_neon(float32x2_t{ -W7_4, -W7_5 }, g);
+ const auto g4 = c_mul_neon(float32x2_t{ -W7_2, -W7_3 }, g);
+ const auto g5 = c_mul_neon(float32x2_t{ W7_0, -W7_1 }, g);
+
+ x1 = reduce_sum_7(a, b, c, d, e, f, g);
+ x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
+ x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
+ x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
+ x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
+ x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
+ x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
+}
+
+constexpr float sqrt2div2 = 0.707106781186548;
+void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+ const float32x2_t &w3,
+ const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+ const float32x2_t &w7)
+{
+ const auto a = x1;
+ const auto b = c_mul_neon(w, x2);
+ const auto c = c_mul_neon(w2, x3);
+ const auto d = c_mul_neon(w3, x4);
+ const auto e = c_mul_neon(w4, x5);
+ const auto f = c_mul_neon(w5, x6);
+ const auto g = c_mul_neon(w6, x7);
+ const auto h = c_mul_neon(w7, x8);
+
+ const auto b0 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, b);
+ const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
+ const auto b2 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, b);
+ const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
+ const auto b4 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, b);
+ const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
+ const auto b6 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+ const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+ const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+ const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
+ const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+ const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+ const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, d);
+ const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
+ const auto d2 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, d);
+ const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
+ const auto d4 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, d);
+ const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
+ const auto d6 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+ const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+ const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+ const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+ const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+ const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+ const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, f);
+ const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
+ const auto f2 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, f);
+ const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
+ const auto f4 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, f);
+ const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
+ const auto f6 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+ const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+ const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+ const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
+ const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+ const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+ const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+
+ const auto h0 = c_mul_neon(float32x2_t{ sqrt2div2, sqrt2div2 }, h);
+ const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
+ const auto h2 = c_mul_neon(float32x2_t{ -sqrt2div2, sqrt2div2 }, h);
+ const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
+ const auto h4 = c_mul_neon(float32x2_t{ -sqrt2div2, -sqrt2div2 }, h);
+ const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
+ const auto h6 = c_mul_neon(float32x2_t{ sqrt2div2, -sqrt2div2 }, h);
+
+ x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
+ x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
+ x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
+ x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
+ x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
+ x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
+ x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
+ x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
+}
+
+template <bool first_stage>
+void fft_radix_2_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+ unsigned int Nx2 = 2 * Nx;
+ float alpha = 2 * PI / Nx2;
+
+ float32x2_t w{ 1, 0 };
+ const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx2)
+ {
+ auto a = float32x2_t{ 0, 0 };
+ auto b = float32x2_t{ 0, 0 };
+
+ // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ }
+
+ // Base-case prime transform
+ fft_2(a, b, w);
+
+ // Write outputs
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ }
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_3_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+ const unsigned int Nx3 = 3 * Nx;
+ const float alpha = 2 * PI / float(Nx3);
+ float32x2_t w{ 1, 0 };
+ const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const auto w2 = c_mul_neon(w, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx3)
+ {
+ // Load inputs
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ }
+ c = wrapper::vload(x + k + 4 * Nx);
+
+ // Base-case prime transform
+ fft_3(a, b, c, w, w2);
+
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ }
+ wrapper::vstore(X + k + 4 * Nx, c);
+ }
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_4_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+ unsigned int Nx4 = 4 * Nx;
+ const float alpha = 2 * PI / float(Nx4);
+
+ float32x2_t w{ 1, 0 };
+ float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const auto w2 = c_mul_neon(w, w);
+ const auto w3 = c_mul_neon(w2, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx4)
+ {
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ }
+ else
+ {
+ // Load inputs
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ }
+
+ // Base-case prime transform
+ fft_4(a, b, c, d, w, w2, w3);
+
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ }
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_5_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+ unsigned int Nx5 = 5 * Nx;
+ const float alpha = 2 * PI / float(Nx5);
+
+ float32x2_t w{ 1, 0 };
+ float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx5)
+ {
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ float32x2_t e = { 0, 0 };
+
+ // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ }
+ e = wrapper::vload(x + k + 8 * Nx);
+
+ // Base-case prime transform
+ fft_5(a, b, c, d, e, w, w2, w3, w4);
+
+ // Store outputs
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ }
+ wrapper::vstore(X + k + 8 * Nx, e);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_7_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+ unsigned int Nx7 = 7 * Nx;
+ const float alpha = 2 * PI / float(Nx7);
+
+ float32x2_t w{ 1, 0 };
+ float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+ const float32x2_t w5 = c_mul_neon(w4, w);
+ const float32x2_t w6 = c_mul_neon(w5, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx7)
+ {
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ float32x2_t e = { 0, 0 };
+ float32x2_t f = { 0, 0 };
+ float32x2_t g = { 0, 0 };
+
+ // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+ const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ e = wrapper::vgetlow(ef);
+ f = wrapper::vgethigh(ef);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ e = wrapper::vload(x + k + 8 * Nx);
+ f = wrapper::vload(x + k + 10 * Nx);
+ }
+ g = wrapper::vload(x + k + 12 * Nx);
+
+ // Base-case prime transform
+ fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
+
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ wrapper::vstore(X + k + 8 * Nx, e);
+ wrapper::vstore(X + k + 10 * Nx, f);
+ }
+ wrapper::vstore(X + k + 12 * Nx, g);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_8_axes_0(float *X, float *x, unsigned int Nx, unsigned int N)
+{
+ unsigned int Nx8 = 8 * Nx;
+ const float alpha = 2 * PI / float(Nx8);
+
+ float32x2_t w{ 1, 0 };
+ const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+ const float32x2_t w5 = c_mul_neon(w4, w);
+ const float32x2_t w6 = c_mul_neon(w5, w);
+ const float32x2_t w7 = c_mul_neon(w6, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * Nx8)
+ {
+ // Load inputs
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ float32x2_t e = { 0, 0 };
+ float32x2_t f = { 0, 0 };
+ float32x2_t g = { 0, 0 };
+ float32x2_t h = { 0, 0 };
+
+ // Base-case prime transform
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+ const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+ const auto gh = wrapper::vloadq(x + k + 12 * Nx);
+
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ e = wrapper::vgetlow(ef);
+ f = wrapper::vgethigh(ef);
+ g = wrapper::vgetlow(gh);
+ h = wrapper::vgethigh(gh);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ e = wrapper::vload(x + k + 8 * Nx);
+ f = wrapper::vload(x + k + 10 * Nx);
+ g = wrapper::vload(x + k + 12 * Nx);
+ h = wrapper::vload(x + k + 14 * Nx);
+ }
+
+ // Apply twiddle factors
+ fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
+
+ // Store outputs
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+ wrapper::vstore(X + k + 12 * Nx, wrapper::vcombine(g, h));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ wrapper::vstore(X + k + 8 * Nx, e);
+ wrapper::vstore(X + k + 10 * Nx, f);
+ wrapper::vstore(X + k + 12 * Nx, g);
+ wrapper::vstore(X + k + 14 * Nx, h);
+ }
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(config.axis != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(NEFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ if(output != nullptr)
+ {
+ auto_init_if_empty(*output, *input);
+ }
+
+ Window win = calculate_max_window(*input, Steps(config.radix));
+ if(output != nullptr)
+ {
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTRadixStageKernel::NEFFTRadixStageKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false), _Nx(0), _func()
+{
+}
+
+template <bool first_stage>
+void NEFFTRadixStageKernel::set_radix_stage_fun(unsigned int radix)
+{
+ switch(radix)
+ {
+ case 2:
+ _func = &fft_radix_2_axes_0<first_stage>;
+ break;
+ case 3:
+ _func = &fft_radix_3_axes_0<first_stage>;
+ break;
+ case 4:
+ _func = &fft_radix_4_axes_0<first_stage>;
+ break;
+ case 5:
+ _func = &fft_radix_5_axes_0<first_stage>;
+ break;
+ case 7:
+ _func = &fft_radix_7_axes_0<first_stage>;
+ break;
+ case 8:
+ _func = &fft_radix_8_axes_0<first_stage>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Radix not supported");
+ }
+}
+
+void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ // Output auto-initialization if not yet initialized
+ if(output != nullptr)
+ {
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+ _input = input;
+ _output = output;
+ _run_in_place = (output == nullptr) || (output == input);
+ _Nx = config.Nx;
+
+ if(config.is_first_stage)
+ {
+ set_radix_stage_fun<true>(config.radix);
+ }
+ else
+ {
+ set_radix_stage_fun<false>(config.radix);
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ const bool run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ (run_in_place) ? nullptr : output->clone().get(),
+ config)
+ .first);
+
+ return Status{};
+}
+
+std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
+{
+ return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
+
+void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window input_window = window;
+ input_window.set(Window::DimX, 0);
+
+ unsigned int N = _input->info()->dimension(0);
+
+ Iterator in(_input, input_window);
+ Iterator out(_run_in_place ? _input : _output, input_window);
+
+ execute_window_loop(input_window, [&](const Coordinates &)
+ {
+ _func(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, N);
+ },
+ in, out);
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+}
+} // namespace arm_compute
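Note: a scalar reference for a single radix-2 stage, equivalent in spirit to fft_radix_2_axes_0 above; offsets are in floats, so complex element n lives at index 2*n of the interleaved buffer (sketch for illustration only, not part of the patch):

    #include <cmath>
    #include <complex>

    // One radix-2 Cooley-Tukey stage on N complex values stored as interleaved real/imag floats.
    // Nx is the length of the already-transformed sub-blocks.
    void radix2_stage_scalar(float *X, const float *x, unsigned int Nx, unsigned int N)
    {
        const float alpha = 2.0f * 3.141592653589793f / (2.0f * Nx);
        std::complex<float> w{ 1.0f, 0.0f };
        const std::complex<float> w_m{ std::cos(alpha), -std::sin(alpha) };

        for(unsigned int j = 0; j < Nx; ++j)
        {
            for(unsigned int k = 2 * j; k < 2 * N; k += 4 * Nx)
            {
                const std::complex<float> a{ x[k], x[k + 1] };
                const std::complex<float> b = w * std::complex<float>{ x[k + 2 * Nx], x[k + 2 * Nx + 1] };

                const std::complex<float> top = a + b; // butterfly: a + w*b
                const std::complex<float> bot = a - b; // butterfly: a - w*b

                X[k]              = top.real();
                X[k + 1]          = top.imag();
                X[k + 2 * Nx]     = bot.real();
                X[k + 2 * Nx + 1] = bot.imag();
            }
            w *= w_m;
        }
    }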
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
new file mode 100644
index 0000000000..d3ff674a2a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _digit_reversed_input(), _digit_reverse_indices(), _digit_reverse_kernel(), _fft_kernels(), _n_ffts(0)
+{
+}
+
+void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &config)
+{
+ // Decompose size to radix factors
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->info()->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+ // Configure digit reverse
+ TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+ _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+ _memory_group.manage(&_digit_reversed_input);
+ _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, config.axis);
+
+ // Create and configure FFT kernels
+ unsigned int Nx = 1;
+ _n_ffts = decomposed_vector.size();
+ _fft_kernels.resize(_n_ffts);
+ for(unsigned int i = 0; i < _n_ffts; ++i)
+ {
+ const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+ FFTRadixStageKernelInfo fft_kernel_desc;
+ fft_kernel_desc.axis = config.axis;
+ fft_kernel_desc.radix = radix_for_stage;
+ fft_kernel_desc.Nx = Nx;
+ fft_kernel_desc.is_first_stage = (i == 0);
+ _fft_kernels[i].configure(&_digit_reversed_input, i == (_n_ffts - 1) ? output : nullptr, fft_kernel_desc);
+
+ Nx *= radix_for_stage;
+ }
+
+ // Allocate tensors
+ _digit_reversed_input.allocator()->allocate();
+ _digit_reverse_indices.allocator()->allocate();
+
+ // Init digit reverse indices
+ const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+ std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+}
+
+Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(config.axis != 0);
+
+ // Check if FFT is decomposable
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void NEFFT1D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ NEScheduler::get().schedule(&_digit_reverse_kernel, Window::DimY);
+
+ for(unsigned int i = 0; i < _n_ffts; ++i)
+ {
+ NEScheduler::get().schedule(&_fft_kernels[i], Window::DimY);
+ }
+}
+} // namespace arm_compute
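Note: configure() factorises the FFT length into the supported radices {2, 3, 4, 5, 7, 8} and chains one NEFFTRadixStageKernel per factor; Nx grows by the radix after each stage, and only the last stage writes to the user-provided output. As a worked example (the exact factor order returned by helpers::fft::decompose_stages may differ), N = 96 = 8 * 4 * 3 would give three stages:

    stage 0: radix 8, Nx = 1,  is_first_stage = true,  output = nullptr (in-place on the digit-reversed buffer)
    stage 1: radix 4, Nx = 8,  is_first_stage = false, output = nullptr (in-place)
    stage 2: radix 3, Nx = 32, is_first_stage = false, output = user output tensor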
diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp
new file mode 100644
index 0000000000..14fb7e2518
--- /dev/null
+++ b/tests/validation/NEON/FFT.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/FFTFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F32 });
+const auto shapes_1d = framework::dataset::make("TensorShape", { TensorShape(2U, 2U, 3U), TensorShape(3U, 2U, 3U),
+ TensorShape(4U, 2U, 3U), TensorShape(5U, 2U, 3U),
+ TensorShape(7U, 2U, 3U), TensorShape(8U, 2U, 3U),
+ TensorShape(9U, 2U, 3U), TensorShape(25U, 2U, 3U),
+ TensorShape(49U, 2U, 3U), TensorShape(64U, 2U, 3U),
+ TensorShape(16U, 2U, 3U), TensorShape(32U, 2U, 3U),
+ TensorShape(96U, 2U, 2U)
+ });
+
+const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
+{
+ ActivationLayerInfo(),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
+
+RelativeTolerance<float> tolerance_f32(0.1f); /**< Relative tolerance value for FP32 */
+constexpr float tolerance_num = 0.07f; /**< Tolerance number */
+
+} // namespace
+TEST_SUITE(NEON)
+TEST_SUITE(FFT1D)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_1d, data_types),
+ shape, data_type)
+{
+ // Create tensors
+ Tensor src = create_tensor<Tensor>(shape, data_type, 2);
+ Tensor dst = create_tensor<Tensor>(shape, data_type, 2);
+
+ ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Create and configure function
+ NEFFT1D fft1d;
+ fft1d.configure(&src, &dst, FFT1DInfo());
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ validate(src.info()->padding(), PaddingSize());
+ validate(dst.info()->padding(), PaddingSize());
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Mismatching data types
+ TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 13U, 2U), 3, DataType::F32), // Invalid channels
+ TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Unsupported axis
+ TensorInfo(TensorShape(11U, 13U, 2U), 2, DataType::F32), // Undecomposable FFT
+ TensorInfo(TensorShape(25U, 13U, 2U), 2, DataType::F32),
+ }),
+ framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F16),
+ TensorInfo(TensorShape(16U, 13U, 2U), 2, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32),
+ TensorInfo(TensorShape(11U, 13U, 2U), 2, DataType::F32),
+ TensorInfo(TensorShape(25U, 13U, 2U), 2, DataType::F32),
+ })),
+ framework::dataset::make("Axis", { 0, 0, 0, 2, 0, 0 })),
+ framework::dataset::make("Expected", { false, false, false, false, false, true })),
+ input_info, output_info, axis, expected)
+{
+ FFT1DInfo desc;
+ desc.axis = axis;
+ const Status s = NEFFT1D::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), desc);
+ ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEFFT1DFixture = FFTValidationFixture<Tensor, Accessor, NEFFT1D, FFT1DInfo, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFFT1DFixture<float>, framework::DatasetMode::ALL, combine(shapes_1d, framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // FFT1D
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute